From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang
Date: Tue, 4 Aug 2015 12:17:53 -0700
Subject: Add the rt linux 4.1.3-rt3 as base

Import the rt linux 4.1.3-rt3 as the OPNFV kvm base. It is taken from
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git,
branch linux-4.1.y-rt, and the base is:

commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior
Date: Sat Jul 25 12:13:34 2015 +0200

    Prepare v4.1.3-rt3

    Signed-off-by: Sebastian Andrzej Siewior

Importing this way loses all the git history, which is not good; we
should use a separate OPNFV project repository for this in the future.

Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang
---
 kernel/drivers/staging/lustre/Kconfig | 3 + kernel/drivers/staging/lustre/Makefile | 2 + kernel/drivers/staging/lustre/README.txt | 87 + kernel/drivers/staging/lustre/TODO | 12 + .../staging/lustre/include/linux/libcfs/curproc.h | 97 + .../staging/lustre/include/linux/libcfs/libcfs.h | 187 + .../lustre/include/linux/libcfs/libcfs_cpu.h | 219 + .../lustre/include/linux/libcfs/libcfs_crypto.h | 199 + .../lustre/include/linux/libcfs/libcfs_debug.h | 262 ++ .../lustre/include/linux/libcfs/libcfs_fail.h | 171 + .../lustre/include/linux/libcfs/libcfs_hash.h | 843 ++++ .../lustre/include/linux/libcfs/libcfs_ioctl.h | 214 + .../include/linux/libcfs/libcfs_kernelcomm.h | 118 + .../lustre/include/linux/libcfs/libcfs_prim.h | 87 + .../lustre/include/linux/libcfs/libcfs_private.h | 556 +++ .../lustre/include/linux/libcfs/libcfs_string.h | 107 + .../lustre/include/linux/libcfs/libcfs_time.h | 131 + .../lustre/include/linux/libcfs/libcfs_workitem.h | 110 + .../lustre/include/linux/libcfs/linux/libcfs.h | 147 + .../lustre/include/linux/libcfs/linux/linux-cpu.h | 82 + .../lustre/include/linux/libcfs/linux/linux-mem.h | 80 + .../lustre/include/linux/libcfs/linux/linux-time.h | 144 + .../lustre/include/linux/lnet/api-support.h | 44 + .../staging/lustre/include/linux/lnet/api.h | 217 + .../staging/lustre/include/linux/lnet/lib-lnet.h | 883 ++++ .../staging/lustre/include/linux/lnet/lib-types.h | 760 ++++ .../lustre/include/linux/lnet/linux/api-support.h | 42 + .../lustre/include/linux/lnet/linux/lib-lnet.h | 71 + .../lustre/include/linux/lnet/linux/lib-types.h | 45 + .../staging/lustre/include/linux/lnet/linux/lnet.h | 56 + .../lustre/include/linux/lnet/lnet-sysctl.h | 49 + .../staging/lustre/include/linux/lnet/lnet.h | 51 + .../staging/lustre/include/linux/lnet/lnetctl.h | 80 + .../staging/lustre/include/linux/lnet/lnetst.h | 491 +++ .../staging/lustre/include/linux/lnet/ptllnd.h | 93 + .../lustre/include/linux/lnet/ptllnd_wire.h | 119 + .../staging/lustre/include/linux/lnet/socklnd.h | 103 + .../staging/lustre/include/linux/lnet/types.h | 492 +++ kernel/drivers/staging/lustre/lnet/Kconfig | 40 + kernel/drivers/staging/lustre/lnet/Makefile | 1 + kernel/drivers/staging/lustre/lnet/klnds/Makefile | 1 + .../staging/lustre/lnet/klnds/o2iblnd/Makefile | 2 + .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c | 3118 ++++++++++++++ .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h | 1030 +++++ .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 3519 +++++++++++++++ .../lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c | 230 + .../staging/lustre/lnet/klnds/socklnd/Makefile | 3 + .../staging/lustre/lnet/klnds/socklnd/socklnd.c | 2886 +++++++++++++ .../staging/lustre/lnet/klnds/socklnd/socklnd.h | 588 +++ .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 2634 ++++++++++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.c | 714
++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.h | 86 + .../lustre/lnet/klnds/socklnd/socklnd_modparams.c | 188 + .../lustre/lnet/klnds/socklnd/socklnd_proto.c | 797 ++++ kernel/drivers/staging/lustre/lnet/lnet/Makefile | 5 + kernel/drivers/staging/lustre/lnet/lnet/acceptor.c | 500 +++ kernel/drivers/staging/lustre/lnet/lnet/api-ni.c | 1940 +++++++++ kernel/drivers/staging/lustre/lnet/lnet/config.c | 1292 ++++++ kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c | 441 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-md.c | 454 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-me.c | 298 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-move.c | 2460 +++++++++++ kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c | 647 +++ kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c | 935 ++++ kernel/drivers/staging/lustre/lnet/lnet/lo.c | 120 + kernel/drivers/staging/lustre/lnet/lnet/module.c | 155 + kernel/drivers/staging/lustre/lnet/lnet/peer.c | 338 ++ kernel/drivers/staging/lustre/lnet/lnet/router.c | 1706 ++++++++ .../drivers/staging/lustre/lnet/lnet/router_proc.c | 968 +++++ .../drivers/staging/lustre/lnet/selftest/Makefile | 4 + .../staging/lustre/lnet/selftest/brw_test.c | 508 +++ .../drivers/staging/lustre/lnet/selftest/conctl.c | 929 ++++ .../drivers/staging/lustre/lnet/selftest/conrpc.c | 1396 ++++++ .../drivers/staging/lustre/lnet/selftest/conrpc.h | 146 + .../drivers/staging/lustre/lnet/selftest/console.c | 2096 +++++++++ .../drivers/staging/lustre/lnet/selftest/console.h | 235 + .../staging/lustre/lnet/selftest/framework.c | 1804 ++++++++ .../drivers/staging/lustre/lnet/selftest/module.c | 159 + .../staging/lustre/lnet/selftest/ping_test.c | 230 + kernel/drivers/staging/lustre/lnet/selftest/rpc.c | 1673 ++++++++ kernel/drivers/staging/lustre/lnet/selftest/rpc.h | 302 ++ .../staging/lustre/lnet/selftest/selftest.h | 624 +++ .../drivers/staging/lustre/lnet/selftest/timer.c | 248 ++ .../drivers/staging/lustre/lnet/selftest/timer.h | 53 + kernel/drivers/staging/lustre/lustre/Kconfig | 62 + kernel/drivers/staging/lustre/lustre/Makefile | 2 + kernel/drivers/staging/lustre/lustre/fid/Makefile | 3 + .../staging/lustre/lustre/fid/fid_internal.h | 56 + kernel/drivers/staging/lustre/lustre/fid/fid_lib.c | 95 + .../staging/lustre/lustre/fid/fid_request.c | 572 +++ .../drivers/staging/lustre/lustre/fid/lproc_fid.c | 225 + kernel/drivers/staging/lustre/lustre/fld/Makefile | 3 + .../drivers/staging/lustre/lustre/fld/fld_cache.c | 546 +++ .../staging/lustre/lustre/fld/fld_internal.h | 193 + .../staging/lustre/lustre/fld/fld_request.c | 526 +++ .../drivers/staging/lustre/lustre/fld/lproc_fld.c | 172 + .../staging/lustre/lustre/include/cl_object.h | 3287 ++++++++++++++ .../staging/lustre/lustre/include/dt_object.h | 1499 +++++++ .../staging/lustre/lustre/include/interval_tree.h | 124 + .../staging/lustre/lustre/include/lclient.h | 433 ++ .../lustre/lustre/include/linux/lustre_compat25.h | 216 + .../lustre/lustre/include/linux/lustre_lite.h | 98 + .../lustre/include/linux/lustre_patchless_compat.h | 85 + .../lustre/lustre/include/linux/lustre_user.h | 70 + .../staging/lustre/lustre/include/linux/obd.h | 125 + .../staging/lustre/lustre/include/lprocfs_status.h | 1015 +++++ .../staging/lustre/lustre/include/lu_object.h | 1340 ++++++ .../drivers/staging/lustre/lustre/include/lu_ref.h | 182 + .../staging/lustre/lustre/include/lustre/libiam.h | 145 + .../lustre/lustre/include/lustre/ll_fiemap.h | 121 + .../lustre/include/lustre/lustre_build_version.h | 2 + .../lustre/lustre/include/lustre/lustre_errno.h | 215 + 
.../lustre/lustre/include/lustre/lustre_idl.h | 3734 ++++++++++++++++ .../lustre/include/lustre/lustre_lfsck_user.h | 95 + .../lustre/lustre/include/lustre/lustre_user.h | 1179 +++++ .../staging/lustre/lustre/include/lustre_acl.h | 49 + .../staging/lustre/lustre/include/lustre_capa.h | 305 ++ .../staging/lustre/lustre/include/lustre_cfg.h | 293 ++ .../staging/lustre/lustre/include/lustre_debug.h | 56 + .../staging/lustre/lustre/include/lustre_disk.h | 547 +++ .../staging/lustre/lustre/include/lustre_dlm.h | 1480 +++++++ .../lustre/lustre/include/lustre_dlm_flags.h | 476 +++ .../staging/lustre/lustre/include/lustre_eacl.h | 95 + .../staging/lustre/lustre/include/lustre_export.h | 406 ++ .../staging/lustre/lustre/include/lustre_fid.h | 767 ++++ .../staging/lustre/lustre/include/lustre_fld.h | 160 + .../staging/lustre/lustre/include/lustre_ha.h | 64 + .../staging/lustre/lustre/include/lustre_handles.h | 97 + .../staging/lustre/lustre/include/lustre_import.h | 385 ++ .../staging/lustre/lustre/include/lustre_intent.h | 62 + .../staging/lustre/lustre/include/lustre_lib.h | 666 +++ .../staging/lustre/lustre/include/lustre_lite.h | 150 + .../staging/lustre/lustre/include/lustre_log.h | 545 +++ .../staging/lustre/lustre/include/lustre_mdc.h | 191 + .../staging/lustre/lustre/include/lustre_mds.h | 81 + .../staging/lustre/lustre/include/lustre_net.h | 2967 +++++++++++++ .../staging/lustre/lustre/include/lustre_param.h | 121 + .../staging/lustre/lustre/include/lustre_quota.h | 241 ++ .../lustre/lustre/include/lustre_req_layout.h | 341 ++ .../staging/lustre/lustre/include/lustre_sec.h | 1147 +++++ .../staging/lustre/lustre/include/lustre_ver.h | 26 + kernel/drivers/staging/lustre/lustre/include/obd.h | 1496 +++++++ .../staging/lustre/lustre/include/obd_cache.h | 39 + .../staging/lustre/lustre/include/obd_cksum.h | 176 + .../staging/lustre/lustre/include/obd_class.h | 1929 +++++++++ .../staging/lustre/lustre/include/obd_support.h | 862 ++++ .../staging/lustre/lustre/lclient/glimpse.c | 269 ++ .../staging/lustre/lustre/lclient/lcommon_cl.c | 1287 ++++++ .../staging/lustre/lustre/lclient/lcommon_misc.c | 199 + .../staging/lustre/lustre/ldlm/interval_tree.c | 751 ++++ kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c | 76 + .../staging/lustre/lustre/ldlm/ldlm_extent.c | 241 ++ .../staging/lustre/lustre/ldlm/ldlm_flock.c | 859 ++++ .../staging/lustre/lustre/ldlm/ldlm_inodebits.c | 74 + .../staging/lustre/lustre/ldlm/ldlm_internal.h | 316 ++ .../drivers/staging/lustre/lustre/ldlm/ldlm_lib.c | 870 ++++ .../drivers/staging/lustre/lustre/ldlm/ldlm_lock.c | 2322 ++++++++++ .../staging/lustre/lustre/ldlm/ldlm_lockd.c | 1191 ++++++ .../staging/lustre/lustre/ldlm/ldlm_plain.c | 72 + .../drivers/staging/lustre/lustre/ldlm/ldlm_pool.c | 1455 +++++++ .../staging/lustre/lustre/ldlm/ldlm_request.c | 2294 ++++++++++ .../staging/lustre/lustre/ldlm/ldlm_resource.c | 1425 +++++++ .../drivers/staging/lustre/lustre/libcfs/Makefile | 18 + .../drivers/staging/lustre/lustre/libcfs/debug.c | 460 ++ kernel/drivers/staging/lustre/lustre/libcfs/fail.c | 138 + kernel/drivers/staging/lustre/lustre/libcfs/hash.c | 2098 +++++++++ .../lustre/lustre/libcfs/kernel_user_comm.c | 240 ++ .../staging/lustre/lustre/libcfs/libcfs_cpu.c | 224 + .../staging/lustre/lustre/libcfs/libcfs_lock.c | 189 + .../staging/lustre/lustre/libcfs/libcfs_mem.c | 202 + .../staging/lustre/lustre/libcfs/libcfs_string.c | 562 +++ .../staging/lustre/lustre/libcfs/linux/linux-cpu.c | 1056 +++++ .../lustre/libcfs/linux/linux-crypto-adler.c | 141 + 
.../lustre/lustre/libcfs/linux/linux-crypto.c | 291 ++ .../lustre/lustre/libcfs/linux/linux-crypto.h | 29 + .../lustre/lustre/libcfs/linux/linux-curproc.c | 111 + .../lustre/lustre/libcfs/linux/linux-debug.c | 200 + .../lustre/lustre/libcfs/linux/linux-module.c | 183 + .../lustre/lustre/libcfs/linux/linux-prim.c | 217 + .../lustre/lustre/libcfs/linux/linux-tcpip.c | 623 +++ .../lustre/lustre/libcfs/linux/linux-tracefile.c | 275 ++ .../lustre/lustre/libcfs/linux/linux-tracefile.h | 48 + .../drivers/staging/lustre/lustre/libcfs/module.c | 976 +++++ .../staging/lustre/lustre/libcfs/nidstrings.c | 842 ++++ kernel/drivers/staging/lustre/lustre/libcfs/prng.c | 139 + .../staging/lustre/lustre/libcfs/tracefile.c | 1196 ++++++ .../staging/lustre/lustre/libcfs/tracefile.h | 340 ++ .../staging/lustre/lustre/libcfs/workitem.c | 479 +++ .../drivers/staging/lustre/lustre/llite/Makefile | 11 + .../drivers/staging/lustre/lustre/llite/dcache.c | 363 ++ kernel/drivers/staging/lustre/lustre/llite/dir.c | 1971 +++++++++ kernel/drivers/staging/lustre/lustre/llite/file.c | 3624 ++++++++++++++++ .../staging/lustre/lustre/llite/llite_capa.c | 654 +++ .../staging/lustre/lustre/llite/llite_close.c | 393 ++ .../staging/lustre/lustre/llite/llite_internal.h | 1521 +++++++ .../staging/lustre/lustre/llite/llite_lib.c | 2354 ++++++++++ .../staging/lustre/lustre/llite/llite_mmap.c | 492 +++ .../staging/lustre/lustre/llite/llite_nfs.c | 335 ++ .../staging/lustre/lustre/llite/llite_rmtacl.c | 300 ++ kernel/drivers/staging/lustre/lustre/llite/lloop.c | 877 ++++ .../staging/lustre/lustre/llite/lproc_llite.c | 1536 +++++++ kernel/drivers/staging/lustre/lustre/llite/namei.c | 1178 +++++ .../staging/lustre/lustre/llite/remote_perm.c | 331 ++ kernel/drivers/staging/lustre/lustre/llite/rw.c | 1289 ++++++ kernel/drivers/staging/lustre/lustre/llite/rw26.c | 553 +++ .../staging/lustre/lustre/llite/statahead.c | 1729 ++++++++ .../drivers/staging/lustre/lustre/llite/super25.c | 226 + .../drivers/staging/lustre/lustre/llite/symlink.c | 170 + .../drivers/staging/lustre/lustre/llite/vvp_dev.c | 547 +++ .../staging/lustre/lustre/llite/vvp_internal.h | 62 + .../drivers/staging/lustre/lustre/llite/vvp_io.c | 1209 ++++++ .../drivers/staging/lustre/lustre/llite/vvp_lock.c | 85 + .../staging/lustre/lustre/llite/vvp_object.c | 201 + .../drivers/staging/lustre/lustre/llite/vvp_page.c | 551 +++ kernel/drivers/staging/lustre/lustre/llite/xattr.c | 621 +++ .../staging/lustre/lustre/llite/xattr_cache.c | 538 +++ kernel/drivers/staging/lustre/lustre/lmv/Makefile | 3 + kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c | 83 + .../drivers/staging/lustre/lustre/lmv/lmv_intent.c | 323 ++ .../staging/lustre/lustre/lmv/lmv_internal.h | 157 + kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c | 2892 +++++++++++++ .../drivers/staging/lustre/lustre/lmv/lproc_lmv.c | 237 ++ kernel/drivers/staging/lustre/lustre/lov/Makefile | 6 + .../staging/lustre/lustre/lov/lov_cl_internal.h | 839 ++++ kernel/drivers/staging/lustre/lustre/lov/lov_dev.c | 528 +++ kernel/drivers/staging/lustre/lustre/lov/lov_ea.c | 363 ++ .../staging/lustre/lustre/lov/lov_internal.h | 319 ++ kernel/drivers/staging/lustre/lustre/lov/lov_io.c | 990 +++++ .../drivers/staging/lustre/lustre/lov/lov_lock.c | 1198 ++++++ .../drivers/staging/lustre/lustre/lov/lov_merge.c | 186 + kernel/drivers/staging/lustre/lustre/lov/lov_obd.c | 2395 +++++++++++ .../drivers/staging/lustre/lustre/lov/lov_object.c | 1001 +++++ .../drivers/staging/lustre/lustre/lov/lov_offset.c | 264 ++ 
.../drivers/staging/lustre/lustre/lov/lov_pack.c | 511 +++ .../drivers/staging/lustre/lustre/lov/lov_page.c | 232 + .../drivers/staging/lustre/lustre/lov/lov_pool.c | 673 +++ .../staging/lustre/lustre/lov/lov_request.c | 773 ++++ .../drivers/staging/lustre/lustre/lov/lovsub_dev.c | 209 + .../drivers/staging/lustre/lustre/lov/lovsub_io.c | 55 + .../staging/lustre/lustre/lov/lovsub_lock.c | 466 ++ .../staging/lustre/lustre/lov/lovsub_object.c | 164 + .../staging/lustre/lustre/lov/lovsub_page.c | 71 + .../drivers/staging/lustre/lustre/lov/lproc_lov.c | 311 ++ kernel/drivers/staging/lustre/lustre/mdc/Makefile | 3 + .../drivers/staging/lustre/lustre/mdc/lproc_mdc.c | 220 + .../staging/lustre/lustre/mdc/mdc_internal.h | 181 + kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c | 593 +++ .../drivers/staging/lustre/lustre/mdc/mdc_locks.c | 1313 ++++++ .../drivers/staging/lustre/lustre/mdc/mdc_reint.c | 483 +++ .../staging/lustre/lustre/mdc/mdc_request.c | 2731 ++++++++++++ kernel/drivers/staging/lustre/lustre/mgc/Makefile | 3 + .../drivers/staging/lustre/lustre/mgc/lproc_mgc.c | 80 + .../staging/lustre/lustre/mgc/mgc_internal.h | 73 + .../staging/lustre/lustre/mgc/mgc_request.c | 1762 ++++++++ .../staging/lustre/lustre/obdclass/Makefile | 11 + .../drivers/staging/lustre/lustre/obdclass/acl.c | 548 +++ .../drivers/staging/lustre/lustre/obdclass/capa.c | 421 ++ .../staging/lustre/lustre/obdclass/cl_internal.h | 121 + .../drivers/staging/lustre/lustre/obdclass/cl_io.c | 1669 ++++++++ .../staging/lustre/lustre/obdclass/cl_lock.c | 2239 ++++++++++ .../staging/lustre/lustre/obdclass/cl_object.c | 1139 +++++ .../staging/lustre/lustre/obdclass/cl_page.c | 1553 +++++++ .../staging/lustre/lustre/obdclass/class_obd.c | 704 +++ .../drivers/staging/lustre/lustre/obdclass/debug.c | 109 + .../staging/lustre/lustre/obdclass/dt_object.c | 1059 +++++ .../staging/lustre/lustre/obdclass/genops.c | 1833 ++++++++ .../lustre/lustre/obdclass/linux/linux-module.c | 449 ++ .../lustre/lustre/obdclass/linux/linux-obdo.c | 222 + .../lustre/lustre/obdclass/linux/linux-sysctl.c | 405 ++ .../drivers/staging/lustre/lustre/obdclass/llog.c | 1007 +++++ .../staging/lustre/lustre/obdclass/llog_cat.c | 815 ++++ .../staging/lustre/lustre/obdclass/llog_internal.h | 98 + .../staging/lustre/lustre/obdclass/llog_obd.c | 262 ++ .../staging/lustre/lustre/obdclass/llog_swab.c | 415 ++ .../lustre/lustre/obdclass/lprocfs_counters.c | 139 + .../lustre/lustre/obdclass/lprocfs_status.c | 2059 +++++++++ .../staging/lustre/lustre/obdclass/lu_object.c | 2192 ++++++++++ .../staging/lustre/lustre/obdclass/lu_ref.c | 50 + .../lustre/lustre/obdclass/lustre_handles.c | 257 ++ .../staging/lustre/lustre/obdclass/lustre_peer.c | 217 + .../staging/lustre/lustre/obdclass/obd_config.c | 1953 +++++++++ .../staging/lustre/lustre/obdclass/obd_mount.c | 1319 ++++++ .../drivers/staging/lustre/lustre/obdclass/obdo.c | 362 ++ .../staging/lustre/lustre/obdclass/statfs_pack.c | 75 + .../drivers/staging/lustre/lustre/obdclass/uuid.c | 82 + .../drivers/staging/lustre/lustre/obdecho/Makefile | 2 + .../staging/lustre/lustre/obdecho/echo_client.c | 2197 ++++++++++ .../staging/lustre/lustre/obdecho/echo_internal.h | 47 + .../staging/lustre/lustre/obdecho/lproc_echo.c | 57 + kernel/drivers/staging/lustre/lustre/osc/Makefile | 4 + .../drivers/staging/lustre/lustre/osc/lproc_osc.c | 751 ++++ .../drivers/staging/lustre/lustre/osc/osc_cache.c | 2944 +++++++++++++ .../staging/lustre/lustre/osc/osc_cl_internal.h | 685 +++ kernel/drivers/staging/lustre/lustre/osc/osc_dev.c | 262 ++ 
.../staging/lustre/lustre/osc/osc_internal.h | 203 + kernel/drivers/staging/lustre/lustre/osc/osc_io.c | 819 ++++ .../drivers/staging/lustre/lustre/osc/osc_lock.c | 1613 +++++++ .../drivers/staging/lustre/lustre/osc/osc_object.c | 271 ++ .../drivers/staging/lustre/lustre/osc/osc_page.c | 916 ++++ .../drivers/staging/lustre/lustre/osc/osc_quota.c | 327 ++ .../staging/lustre/lustre/osc/osc_request.c | 3379 +++++++++++++++ .../drivers/staging/lustre/lustre/ptlrpc/Makefile | 20 + .../drivers/staging/lustre/lustre/ptlrpc/client.c | 3149 ++++++++++++++ .../staging/lustre/lustre/ptlrpc/connection.c | 241 ++ .../drivers/staging/lustre/lustre/ptlrpc/errno.c | 380 ++ .../drivers/staging/lustre/lustre/ptlrpc/events.c | 585 +++ .../drivers/staging/lustre/lustre/ptlrpc/import.c | 1642 +++++++ .../drivers/staging/lustre/lustre/ptlrpc/layout.c | 2442 +++++++++++ .../staging/lustre/lustre/ptlrpc/llog_client.c | 366 ++ .../staging/lustre/lustre/ptlrpc/llog_net.c | 72 + .../staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c | 1366 ++++++ .../drivers/staging/lustre/lustre/ptlrpc/niobuf.c | 731 ++++ kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c | 1754 ++++++++ .../staging/lustre/lustre/ptlrpc/nrs_fifo.c | 270 ++ .../staging/lustre/lustre/ptlrpc/pack_generic.c | 2536 +++++++++++ kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c | 75 + .../drivers/staging/lustre/lustre/ptlrpc/pinger.c | 678 +++ .../staging/lustre/lustre/ptlrpc/ptlrpc_internal.h | 312 ++ .../staging/lustre/lustre/ptlrpc/ptlrpc_module.c | 171 + .../drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c | 811 ++++ .../drivers/staging/lustre/lustre/ptlrpc/recover.c | 379 ++ kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c | 2459 +++++++++++ .../staging/lustre/lustre/ptlrpc/sec_bulk.c | 884 ++++ .../staging/lustre/lustre/ptlrpc/sec_config.c | 901 ++++ .../drivers/staging/lustre/lustre/ptlrpc/sec_gc.c | 252 ++ .../staging/lustre/lustre/ptlrpc/sec_lproc.c | 199 + .../staging/lustre/lustre/ptlrpc/sec_null.c | 458 ++ .../staging/lustre/lustre/ptlrpc/sec_plain.c | 1013 +++++ .../drivers/staging/lustre/lustre/ptlrpc/service.c | 3105 ++++++++++++++ .../staging/lustre/lustre/ptlrpc/wiretest.c | 4492 ++++++++++++++++++++ 330 files changed, 219991 insertions(+) create mode 100644 kernel/drivers/staging/lustre/Kconfig create mode 100644 kernel/drivers/staging/lustre/Makefile create mode 100644 kernel/drivers/staging/lustre/README.txt create mode 100644 kernel/drivers/staging/lustre/TODO create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h create 
mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/api-support.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/api.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/types.h create mode 100644 kernel/drivers/staging/lustre/lnet/Kconfig create mode 100644 kernel/drivers/staging/lustre/lnet/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/acceptor.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/api-ni.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/config.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-md.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-me.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-move.c create mode 100644 
kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lo.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/module.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/peer.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/router.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/router_proc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/brw_test.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conctl.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conrpc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conrpc.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/console.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/console.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/framework.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/module.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/ping_test.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/rpc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/rpc.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/selftest.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/timer.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/timer.h create mode 100644 kernel/drivers/staging/lustre/lustre/Kconfig create mode 100644 kernel/drivers/staging/lustre/lustre/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fid/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c create mode 100644 kernel/drivers/staging/lustre/lustre/include/cl_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/dt_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/interval_tree.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lclient.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/obd.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lu_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lu_ref.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h create mode 100644 
kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_acl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_capa.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_debug.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_disk.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_export.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_fid.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_fld.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_ha.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_handles.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_import.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_intent.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_lib.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_lite.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_log.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_mds.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_net.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_param.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_quota.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_sec.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_ver.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_cache.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_cksum.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_class.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_support.h create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/glimpse.c create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h create mode 100644 
kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/fail.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/hash.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/module.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/prng.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/workitem.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/llite/dcache.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/dir.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/file.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_capa.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_close.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/lloop.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c create mode 100644 
kernel/drivers/staging/lustre/lustre/llite/namei.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/remote_perm.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/rw.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/rw26.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/statahead.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/super25.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/symlink.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/xattr.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_ea.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_merge.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_offset.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_pack.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_pool.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c create mode 100644 
kernel/drivers/staging/lustre/lustre/mgc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/acl.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/capa.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/genops.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obdo.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/uuid.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_lock.c create 
mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_quota.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/client.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/events.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/import.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/service.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c (limited to 'kernel/drivers/staging/lustre') diff --git a/kernel/drivers/staging/lustre/Kconfig b/kernel/drivers/staging/lustre/Kconfig new file mode 100644 index 000000000..a224d88bf --- /dev/null +++ b/kernel/drivers/staging/lustre/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustre/lustre/Kconfig" + +source "drivers/staging/lustre/lnet/Kconfig" diff --git a/kernel/drivers/staging/lustre/Makefile b/kernel/drivers/staging/lustre/Makefile new file mode 100644 index 000000000..95ffe337a --- /dev/null +++ b/kernel/drivers/staging/lustre/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LNET) += lnet/ +obj-$(CONFIG_LUSTRE_FS) += lustre/ diff --git a/kernel/drivers/staging/lustre/README.txt b/kernel/drivers/staging/lustre/README.txt new file mode 100644 index 000000000..cf0ca50ff --- /dev/null +++ b/kernel/drivers/staging/lustre/README.txt @@ -0,0 +1,87 @@ +Lustre Parallel Filesystem Client +================================= + +The Lustre file system is an open-source, parallel file system +that supports many requirements of leadership class HPC simulation +environments. 
+Born from a research project at Carnegie Mellon University,
+the Lustre file system is a widely-used option in HPC.
+The Lustre file system provides a POSIX-compliant file system interface,
+can scale to thousands of clients, petabytes of storage and
+hundreds of gigabytes per second of I/O bandwidth.
+
+Unlike shared disk storage cluster filesystems (e.g. OCFS2, GFS, GPFS),
+Lustre has independent metadata and data servers that clients can access
+in parallel to maximize performance.
+
+To use the Lustre client you will need to download the Lustre client
+tools from
+https://downloads.hpdd.intel.com/public/lustre/latest-feature-release/
+(the package name is lustre-client).
+
+You will need to install and configure your Lustre servers separately.
+
+Mount Syntax
+============
+After you have installed the lustre-client tools, including the mount.lustre
+binary, you can mount your Lustre filesystem with:
+
+mount -t lustre mgs:/fsname mnt
+
+where mgs is the host name or IP address of your Lustre MGS (management service)
+and fsname is the name of the filesystem you would like to mount.
+
+
+Mount Options
+=============
+
+  noflock
+    Disable POSIX file locking (applications trying to use
+    the functionality will get ENOSYS)
+
+  localflock
+    Enable local flock support, using only client-local flock
+    (faster, for applications that require flock but do not run
+    on multiple nodes).
+
+  flock
+    Enable cluster-global POSIX file locking coherent across all
+    client nodes.
+
+  user_xattr, nouser_xattr
+    Support "user." extended attributes (or not)
+
+  user_fid2path, nouser_fid2path
+    Enable FID to path translation by regular users (or not)
+
+  checksum, nochecksum
+    Verify data consistency on the wire and in memory as it passes
+    between the layers (or not).
+
+  lruresize, nolruresize
+    Allow lock LRU to be controlled by memory pressure on the server
+    (or only 100 (default, controlled by lru_size proc parameter) locks
+    per CPU per server on this client).
+
+  lazystatfs, nolazystatfs
+    Do not block in statfs() if some of the servers are down.
+
+  32bitapi
+    Shrink inode numbers to fit into 32 bits. This is necessary
+    if you plan to re-export the Lustre filesystem from this client via
+    NFSv4.
+
+  verbose, noverbose
+    Enable mount/umount console messages (or not)
+
+More Information
+================
+You can get more information at
+OpenSFS website: http://lustre.opensfs.org/about/
+Intel HPDD wiki: https://wiki.hpdd.intel.com
+
+Out-of-tree Lustre client and server code is available at:
+http://git.whamcloud.com/fs/lustre-release.git
+
+Latest binary packages:
+http://lustre.opensfs.org/download-lustre/
diff --git a/kernel/drivers/staging/lustre/TODO b/kernel/drivers/staging/lustre/TODO
new file mode 100644
index 000000000..0512594b5
--- /dev/null
+++ b/kernel/drivers/staging/lustre/TODO
@@ -0,0 +1,12 @@
+* Fix possible remaining coding style issues.
+* Remove dead code.
+* Separate client/server functionality. Functions only used by the server can be
+  removed from the client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to better
+  suit what the kernel provides.
+* Add documentation in Documentation/.
+* Other minor cleanups...
+
+Please send any patches to Greg Kroah-Hartman , Andreas Dilger
+, and Oleg Drokin .
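As a quick illustration of how the options listed in README.txt above are used,
several of them can be passed together via -o in one mount invocation. This is
only a sketch: the MGS NID (mgs01@tcp0), filesystem name (lustrefs) and mount
point (/mnt/lustre) are hypothetical placeholders, not values defined by this
patch.

  # example only: adjust the MGS NID, fsname and mount point for your site
  mount -t lustre -o flock,user_xattr,lazystatfs mgs01@tcp0:/lustrefs /mnt/lustre

Here flock enables cluster-global POSIX locking, user_xattr enables "user."
extended attributes, and lazystatfs keeps statfs() from blocking when some
servers are down, as described in the option list above.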
diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h b/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h new file mode 100644 index 000000000..1edfca58c --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/curproc.h + * + * Lustre curproc API declaration + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_CURPROC_H__ +#define __LIBCFS_CURPROC_H__ + +/* + * Plus, platform-specific constant + * + * CFS_CURPROC_COMM_MAX, + * + * and opaque scalar type + * + * kernel_cap_t + */ + +/* check if task is running in compat mode.*/ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + +typedef __u32 cfs_cap_t; + +#define CFS_CAP_CHOWN 0 +#define CFS_CAP_DAC_OVERRIDE 1 +#define CFS_CAP_DAC_READ_SEARCH 2 +#define CFS_CAP_FOWNER 3 +#define CFS_CAP_FSETID 4 +#define CFS_CAP_LINUX_IMMUTABLE 9 +#define CFS_CAP_SYS_ADMIN 21 +#define CFS_CAP_SYS_BOOT 23 +#define CFS_CAP_SYS_RESOURCE 24 + +#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) | \ + (1 << CFS_CAP_DAC_OVERRIDE) | \ + (1 << CFS_CAP_DAC_READ_SEARCH) | \ + (1 << CFS_CAP_FOWNER) | \ + (1 << CFS_CAP_FSETID) | \ + (1 << CFS_CAP_LINUX_IMMUTABLE) | \ + (1 << CFS_CAP_SYS_ADMIN) | \ + (1 << CFS_CAP_SYS_BOOT) | \ + (1 << CFS_CAP_SYS_RESOURCE)) + +void cfs_cap_raise(cfs_cap_t cap); +void cfs_cap_lower(cfs_cap_t cap); +int cfs_cap_raised(cfs_cap_t cap); +cfs_cap_t cfs_curproc_cap_pack(void); + +/* __LIBCFS_CURPROC_H__ */ +#endif +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h new file mode 100644 index 000000000..4410d7fdc --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -0,0 +1,187 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#if !__GNUC__ +#define __attribute__(x) +#endif + +#include "linux/libcfs.h" +#include + +#include "curproc.h" + +#ifndef offsetof +# define offsetof(typ, memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb))) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif + +#if !defined(swap) +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#if !defined(container_of) +/* given a pointer @ptr to the field @member embedded into type (usually + * struct) @type, return pointer to the embedding instance of @type. */ +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) +#endif + +static inline int __is_po2(unsigned long long val) +{ + return !(val & (val - 1)); +} + +#define IS_PO2(val) __is_po2((unsigned long long)(val)) + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. 
+ */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define LUSTRE_SRV_LNET_PID LUSTRE_LNET_PID + +#include + +int libcfs_arch_init(void); +void libcfs_arch_cleanup(void); + +/* libcfs tcpip */ +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int libcfs_ipif_enumerate(char ***names); +void libcfs_ipif_free_enumeration(char **names, int n); +int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); +int libcfs_sock_accept(struct socket **newsockp, struct socket *sock); +void libcfs_sock_abort_accept(struct socket *sock); +int libcfs_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); +int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int libcfs_sock_getaddr(struct socket *socket, int remote, __u32 *ip, int *port); +int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout); +void libcfs_sock_release(struct socket *sock); + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +/* + * libcfs pseudo device operations + * + * It's just draft now. + */ + +struct cfs_psdev_file { + unsigned long off; + void *private_data; + unsigned long reserved1; + unsigned long reserved2; +}; + +struct cfs_psdev_ops { + int (*p_open)(unsigned long, void *); + int (*p_close)(unsigned long, void *); + int (*p_read)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_write)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *); +}; + +/* + * Drop into debugger, if possible. Implementation is provided by platform. 
+ */ + +void cfs_enter_debugger(void); + +/* + * Defined by platform + */ +int unshare_fs_struct(void); +sigset_t cfs_get_blocked_sigs(void); +sigset_t cfs_block_allsigs(void); +sigset_t cfs_block_sigs(unsigned long sigs); +sigset_t cfs_block_sigsinv(unsigned long sigs); +void cfs_restore_sigs(sigset_t); +int cfs_signal_pending(void); +void cfs_clear_sigpending(void); + +/* + * Random number handling + */ + +/* returns a random 32-bit integer */ +unsigned int cfs_rand(void); +/* seed the generator */ +void cfs_srand(unsigned int, unsigned int); +void cfs_get_random_bytes(void *buf, int size); + +#include "libcfs_debug.h" +#include "libcfs_cpu.h" +#include "libcfs_private.h" +#include "libcfs_ioctl.h" +#include "libcfs_prim.h" +#include "libcfs_time.h" +#include "libcfs_string.h" +#include "libcfs_kernelcomm.h" +#include "libcfs_workitem.h" +#include "libcfs_hash.h" +#include "libcfs_fail.h" +#include "libcfs_crypto.h" + +/* container_of depends on "likely" which is defined in libcfs_private.h */ +static inline void *__container_of(void *ptr, unsigned long shift) +{ + if (unlikely(IS_ERR(ptr) || ptr == NULL)) + return ptr; + return (char *)ptr - shift; +} + +#define container_of0(ptr, type, member) \ + ((type *)__container_of((void *)(ptr), offsetof(type, member))) + +#define _LIBCFS_H + +#endif /* _LIBCFS_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h new file mode 100644 index 000000000..787867847 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . 
User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +#ifdef CONFIG_SMP +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +#else /* !CONFIG_SMP */ +struct cfs_cpt_table { + /* # of CPU partitions */ + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; +}; + +static inline cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + return NULL; +} + +static inline int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + return 0; +} +#endif /* CONFIG_SMP */ + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt); +/** + * return total number of CPU partitions in \a cptab + */ +int +cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hyper-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int 
cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * unset all cpus for CPU partition \a cpt + */ +void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/** + * return number of HTs in the same core of \a cpu + */ +int cfs_cpu_ht_nsiblings(int cpu); + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h new file mode 100644 index 000000000..e8663697e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h @@ -0,0 +1,199 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. 
+ * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (valid for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_CRC32C, + CFS_HASH_ALG_MAX +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { "null", 0, 0 }, + [CFS_HASH_ALG_ADLER32] = { "adler32", 1, 4 }, + [CFS_HASH_ALG_CRC32] = { "crc32", ~0, 4 }, + [CFS_HASH_ALG_CRC32C] = { "crc32c", ~0, 4 }, + [CFS_HASH_ALG_MD5] = { "md5", 0, 16 }, + [CFS_HASH_ALG_SHA1] = { "sha1", 0, 20 }, + [CFS_HASH_ALG_SHA256] = { "sha256", 0, 32 }, + [CFS_HASH_ALG_SHA384] = { "sha384", 0, 48 }, + [CFS_HASH_ALG_SHA512] = { "sha512", 0, 64 }, +}; + +/** Return pointer to type of hash for valid hash algorithm identifier */ +static inline const struct cfs_crypto_hash_type * + cfs_crypto_hash_type(unsigned char hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name) + return ht; + } + return NULL; +} + +/** Return hash name for valid hash algorithm identifier or "unknown" */ +static inline const char *cfs_crypto_hash_name(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + return "unknown"; +} + +/** Return digest size for valid algorithm identifier or 0 */ +static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_size; + return 0; +} + +/** Return hash identifier for valid hash algorithm name or 0xFF */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + unsigned char i; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + if (!strcmp(hash_types[i].cht_name, algname)) + break; + return (i == CFS_HASH_ALG_MAX ? 0xFF : i); +} + +/** Calculate hash digest for buffer. + * @param alg id of hash algorithm + * @param buf buffer of data + * @param buf_len buffer len + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. + * @param key_len len of initial value + * @param hash [out] pointer to hash, if it is NULL, hash_len is + * set to valid digest size in bytes, retval -ENOSPC. + * @param hash_len [in,out] size of hash buffer + * @returns status of operation + * @retval -EINVAL if buf, buf_len, hash_len or alg_id is invalid + * @retval -ENODEV if this algorithm is unsupported + * @retval -ENOSPC if pointer to hash is NULL, or hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_digest(unsigned char alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct cfs_crypto_hash_desc; + +/** Allocate and initialize descriptor for hash algorithm. + * @param alg algorithm id + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. 
+ * @param key_len len of initial value + * @returns pointer to descriptor of hash instance + * @retval ERR_PTR(error) when errors occurred. + */ +struct cfs_crypto_hash_desc* + cfs_crypto_hash_init(unsigned char alg, + unsigned char *key, unsigned int key_len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param page data page + * @param offset data offset + * @param len data len + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, + struct page *page, unsigned int offset, + unsigned int len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param buf pointer to data buffer + * @param buf_len size of data at buffer + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, + unsigned int buf_len); + +/** Finalize hash calculation, copy hash digest to buffer, destroy hash + * descriptor. + * @param desc hash descriptor + * @param hash buffer pointer to store hash digest + * @param hash_len pointer to hash buffer size, if NULL + * destroy hash descriptor + * @returns status of operation + * @retval -ENOSPC if hash is NULL, or *hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, + unsigned char *hash, unsigned int *hash_len); +/** + * Register crypto hash algorithms + */ +int cfs_crypto_register(void); + +/** + * Unregister + */ +void cfs_crypto_unregister(void); + +/** Return hash speed in Mbytes per second for valid hash algorithm + * identifier. If test was unsuccessful -1 would be returned. + */ +int cfs_crypto_hash_speed(unsigned char hash_alg); +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h new file mode 100644 index 000000000..8251ac932 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char libcfs_debug_file_path_arr[PATH_MAX]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __packed; + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +/* unused */ +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +/* unused */ +/* unused */ +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ + +/* Debugging masks (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +/* keep these in sync with lnet/{utils,libcfs}/debug.c */ + +#define D_HSM D_TRACE + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ +do { \ + (data)->msg_subsys = DEBUG_SUBSYSTEM; \ + (data)->msg_file = __FILE__; \ + (data)->msg_fn = __func__; \ + (data)->msg_line = __LINE__; \ + (data)->msg_cdls = (cdls); \ + (data)->msg_mask = (mask); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ + static struct libcfs_debug_msg_data dataname = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = __FILE__, \ + .msg_fn = __func__, \ + .msg_line = __LINE__, \ + .msg_cdls = (cdls) }; \ + dataname.msg_mask = (mask) + +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +#define __CDEBUG(cdls, mask, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + CFS_CHECK_STACK(&msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) + +#define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## __VA_ARGS__); \ +} while (0) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) 
+ __printf(2, 3); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, + va_list args, const char *format2, ...) + __printf(4, 5); + +/* other external symbols that tracefile provides: */ +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h new file mode 100644 index 000000000..eea55d94e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
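+ *
+ * The fail-injection hooks declared below are normally driven through the
+ * CFS_FAIL_* wrappers; a minimal sketch (illustrative only, 0x123/0x124 are
+ * made-up fail_loc values):
+ *
+ *	if (CFS_FAIL_CHECK(0x123))	-- hit when cfs_fail_loc matches 0x123
+ *		return -EIO;
+ *
+ *	CFS_FAIL_TIMEOUT(0x124, 5);	-- sleep 5 seconds when 0x124 is armed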
+ */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED (1 << CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE (1 << CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc && \ + (cfs_fail_loc & CFS_FAIL_MASK_LOC) == \ + ((id) & CFS_FAIL_MASK_LOC)) + +static inline int cfs_fail_check_set(__u32 id, __u32 value, + int set, int quiet) +{ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, secs * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +/* The idea here is to synchronise two threads to force a 
race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} + +#define CFS_RACE(id) cfs_race(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h new file mode 100644 index 000000000..c40814591 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h @@ -0,0 +1,843 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/* + * Ideally we would use HAVE_HASH_LONG for this, but on linux we configure + * the linux kernel and user space at the same time, so we need to differentiate + * between them explicitly. If this is not needed on other architectures, then + * we'll need to move the functions to architecture specific headers. 
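+ *
+ * Either way the multiplicative scheme is the same; e.g. (illustrative) a
+ * 32-bit key is spread over 2^bits hash lists as
+ *
+ *	(key * CFS_GOLDEN_RATIO_PRIME_32) & ((1 << bits) - 1)
+ *
+ * which is what cfs_hash_u32_hash() below computes when called with
+ * mask == (1 << bits) - 1.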
+ */ + +#include + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +union cfs_hash_lock { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ +}; + +/** + * cfs_hash_bucket is a container of: + * - lock, counter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . cfs_hash_head_t + * . cfs_hash_head_dep_t + * . cfs_hash_dhead_t + * . cfs_hash_dhead_dep_t + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +struct cfs_hash_bucket { + union cfs_hash_lock hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +}; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +struct cfs_hash_bd { + struct cfs_hash_bucket *bd_bucket; /**< address of bucket */ + unsigned int bd_offset; /**< offset in bucket */ +}; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = 1 << 0, + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = 1 << 1, + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = 1 << 2, + /** spinlock to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = 1 << 3, + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = 1 << 4, + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = 1 << 5, + /** big name for param-tree */ + CFS_HASH_BIGNAME = 1 << 6, + /** track global count */ + CFS_HASH_COUNTER = 1 << 7, + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = 1 << 8, + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = 1 << 9, + /** can shrink hash-size */ + CFS_HASH_SHRINK = 1 << 10, + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = 1 << 11, + /** record hlist depth */ + CFS_HASH_DEPTH = 1 << 12, + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = 1 << 13, + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . 
general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. + */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + cfs_workitem_t hs_rehash_wi; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + cfs_workitem_t hs_dep_wi; +#endif + /** name of htable */ + char hs_name[0]; +}; + +typedef struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void 
(*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +} cfs_hash_lock_ops_t; + +typedef struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode); +} cfs_hash_hlist_ops_t; + +typedef struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item */ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +} cfs_hash_ops_t; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return 
(hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} + +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + 
hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd); + +static inline void cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void cfs_hash_bd_index_set(struct cfs_hash *hs, + unsigned index, struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int +cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode); + +static inline int cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, + &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head *cfs_hash_bd_hhead(struct cfs_hash *hs, + struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node *cfs_hash_bd_lookup_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key); +struct hlist_node *cfs_hash_bd_peek_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key); +struct hlist_node *cfs_hash_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_bd_finddel_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key, + struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl); + +static inline void cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + 
cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node *cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key); +struct hlist_node *cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key, + struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode); +void *cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *node, void *data); +void *cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. 
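+ *
+ * For example (illustrative): a table with hs_cur_bits == 10 (1024 hash
+ * lists) holding 4096 items has, with CFS_HASH_THETA_BITS == 10,
+ * theta == (4096 << 10) >> 10 == 4096, which __cfs_hash_theta_int() and
+ * __cfs_hash_theta_frac() below decode as an average depth of 4.000; for
+ * tables created with CFS_HASH_REHASH, a rehash is scheduled when theta
+ * drifts outside the [hs_min_theta, hs_max_theta] window.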
+ */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +int cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void __cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for character arrays. + */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. 
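+ *
+ * A cfs_hash_ops_t::hs_hash callback will typically just forward to one of
+ * these helpers; a minimal sketch (the callback name and the __u64 key type
+ * are assumptions for illustration):
+ *
+ *	static unsigned
+ *	obj_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+ *	{
+ *		return cfs_hash_u64_hash(*(const __u64 *)key, mask);
+ *	}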
+ */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h new file mode 100644 index 000000000..3ee38782a --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_ioctl.h + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfsutil_ioctl.h. 
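+ *
+ * A typical user-space caller fills in the request with LIBCFS_IOC_INIT()
+ * and issues one of the IOC_LIBCFS_* commands defined below; a minimal
+ * sketch ("fd" is assumed to be an open descriptor on the LNet control
+ * device):
+ *
+ *	struct libcfs_ioctl_data data;
+ *
+ *	LIBCFS_IOC_INIT(data);
+ *	rc = ioctl(fd, IOC_LIBCFS_CLEAR_DEBUG, &data);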
+ * + */ + +#ifndef __LIBCFS_IOCTL_H__ +#define __LIBCFS_IOCTL_H__ + +#define LIBCFS_IOCTL_VERSION 0x0001000a + +struct libcfs_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +#define ioc_priority ioc_u32[0] + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +struct libcfs_debug_ioctl_data { + struct libcfs_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&data, 0, sizeof(data)); \ + data.ioc_version = LIBCFS_IOCTL_VERSION; \ + data.ioc_len = sizeof(data); \ +} while (0) + +struct libcfs_ioctl_handler { + struct list_head item; + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data); +}; + +#define DECLARE_IOCTL_HANDLER(ident, func) \ + struct libcfs_ioctl_handler ident = { \ + /* .item = */ LIST_HEAD_INIT(ident.item), \ + /* .handle_ioctl = */ func \ + } + +/* FIXME check conflict with lustre_lib.h */ +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) + +/* ioctls for manipulating snapshots 30- */ +#define IOC_LIBCFS_TYPE 'e' +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +#define IOC_LIBCFS_PANIC _IOWR('e', 30, long) +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, long) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, long) +#define IOC_LIBCFS_MEMHOG _IOWR('e', 36, long) +#define IOC_LIBCFS_PING_TEST _IOWR('e', 37, long) +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, long) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, long) +#define IOC_LIBCFS_ADD_ROUTE _IOWR('e', 52, long) +#define IOC_LIBCFS_DEL_ROUTE _IOWR('e', 53, long) +#define IOC_LIBCFS_GET_ROUTE _IOWR('e', 54, long) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, long) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, long) +#define IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, long) +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, long) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, long) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, long) +#define IOC_LIBCFS_PING _IOWR('e', 61, long) +#define IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, long) +#define IOC_LIBCFS_LNETST _IOWR('e', 63, long) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, long) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, long) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, long) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, long) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, long) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, long) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, long) +#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, long) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, long) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, long) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, long) + +#define IOC_LIBCFS_MAX_NR 80 + +static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + int len = sizeof(*data); + + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + return len; +} + +static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR("LIBCFS ioctl: 
ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len) { + CERROR("LIBCFS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[cfs_size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +int libcfs_ioctl_getdata(char *buf, char *end, void *arg); +int libcfs_ioctl_popdata(void *arg, void *buf, int size); + +#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h new file mode 100644 index 000000000..a989d2666 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h @@ -0,0 +1,118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * libcfs/include/libcfs/libcfs_kernelcomm.h + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. 
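/*
 * Illustrative sketch (editorial example; libcfs_ioctl_pack_example is a
 * hypothetical helper): inline buffers travel in ioc_bulk directly behind
 * the descriptor, so ioc_len must equal libcfs_ioctl_packlen() and inline
 * strings must stay NUL-terminated for libcfs_ioctl_is_invalid() to accept
 * the request.
 */
static inline int libcfs_ioctl_pack_example(const char *name)
{
	char buf[sizeof(struct libcfs_ioctl_data) + 64];
	struct libcfs_ioctl_data *data = (struct libcfs_ioctl_data *)buf;
	int nob = strlen(name) + 1;

	if (nob > 64)
		return -E2BIG;

	memset(buf, 0, sizeof(buf));
	data->ioc_version = LIBCFS_IOCTL_VERSION;
	data->ioc_inllen1 = nob;
	memcpy(data->ioc_bulk, name, nob);	/* NUL terminator included */
	data->ioc_len = libcfs_ioctl_packlen(data);

	return libcfs_ioctl_is_invalid(data) ? -EINVAL : 0;
}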
+ * + */ + +#ifndef __LIBCFS_KERNELCOMM_H__ +#define __LIBCFS_KERNELCOMM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. + */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __aligned(sizeof(__u64)); + +#define KUC_CHANGELOG_MSG_MAXSIZE (sizeof(struct kuc_hdr)+CR_MAXSIZE) + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ +#define KUC_FL_BLOCK 0x01 /* Wait for send */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, + KUC_TRANSPORT_CHANGELOG = 3, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg); + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +/* Kernel methods */ +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(int group, void *payload); +int libcfs_kkuc_group_add(struct file *fp, int uid, int group, + __u32 data); +int libcfs_kkuc_group_rem(int uid, int group); +int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg); + +#define LK_FLG_STOP 0x01 + +/* kernelcomm control structure, passed from userspace to kernel */ +typedef struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data; + __u32 lk_flags; +} __packed lustre_kernelcomm; + +/* Userspace methods */ +int libcfs_ukuc_start(lustre_kernelcomm *l, int groups); +int libcfs_ukuc_stop(lustre_kernelcomm *l); +int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize, + int transport); + +#endif /* __LIBCFS_KERNELCOMM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h new file mode 100644 index 000000000..978d3e2f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
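/*
 * Illustrative sketch (editorial example; kuc_shutdown_example is a
 * hypothetical name): a minimal KUC message is just a kuc_hdr.  A sender
 * fills in the magic, transport, message type and total length, then hands
 * it to libcfs_kkuc_group_put() to broadcast it to every reader registered
 * in that group.
 */
static inline int kuc_shutdown_example(void)
{
	struct kuc_hdr hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.kuc_magic = KUC_MAGIC;
	hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
	hdr.kuc_msgtype = KUC_MSG_SHUTDOWN;
	hdr.kuc_msglen = sizeof(hdr);	/* header only, no payload */

	return libcfs_kkuc_group_put(KUC_GRP_HSM, &hdr);
}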
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_prim.h + * + * General primitives. + * + */ + +#ifndef __LIBCFS_PRIM_H__ +#define __LIBCFS_PRIM_H__ + +/* + * Timer + */ +typedef void (cfs_timer_func_t)(ulong_ptr_t); + +void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *); + +void cfs_init_timer(struct timer_list *t); +void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg); +void cfs_timer_done(struct timer_list *t); +void cfs_timer_arm(struct timer_list *t, unsigned long deadline); +void cfs_timer_disarm(struct timer_list *t); +int cfs_timer_is_armed(struct timer_list *t); +unsigned long cfs_timer_deadline(struct timer_list *t); + +/* + * Memory + */ +#ifndef memory_pressure_get +#define memory_pressure_get() (0) +#endif +#ifndef memory_pressure_set +#define memory_pressure_set() do {} while (0) +#endif +#ifndef memory_pressure_clr +#define memory_pressure_clr() do {} while (0) +#endif + +static inline int cfs_memory_pressure_get_and_set(void) +{ + int old = memory_pressure_get(); + + if (!old) + memory_pressure_set(); + return old; +} + +static inline void cfs_memory_pressure_restore(int old) +{ + if (old) + memory_pressure_set(); + else + memory_pressure_clr(); +} +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h new file mode 100644 index 000000000..fef882530 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -0,0 +1,556 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
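/*
 * Illustrative sketch (editorial example; cfs_memory_pressure_example is a
 * hypothetical name, and kmalloc/GFP_NOFS are shown only for illustration):
 * the two helpers above are intended to be used as a save/restore pair
 * around work that must not recurse into the filesystem.
 */
static inline void *cfs_memory_pressure_example(size_t size)
{
	int old = cfs_memory_pressure_get_and_set();	/* mark PF_MEMALLOC */
	void *ptr = kmalloc(size, GFP_NOFS);

	cfs_memory_pressure_restore(old);	/* undo only if we set it */
	return ptr;
}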
+ * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +/* XXX this layering violation is for nidstrings */ +#include "../lnet/types.h" + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by + * CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof !!(exp)) +#endif + +#define KLASSERT(e) LASSERT(e) + +void lbug_with_loc(struct libcfs_debug_msg_data *)__attribute__((noreturn)); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while (0) + +extern atomic_t libcfs_kmemory; +/* + * Memory + */ + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic_sub(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_read() \ + atomic_read(&libcfs_kmemory) + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_CACHE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & __GFP_WAIT) == 0)); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + memset((ptr), 0, (size)); \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kmalloc((size), (mask)) : vmalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? 
\ + kmalloc_node((size), (mask), cfs_cpt_spread_node(cptab, cpt)) :\ + vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + vfree(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */ +#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) +#define ___htonl(x) __cpu_to_be32(x) +#define ___htons(x) __cpu_to_be16(x) +#define ___ntohl(x) __be32_to_cpu(x) +#define ___ntohs(x) __be16_to_cpu(x) +#define htonl(x) ___htonl(x) +#define ntohl(x) ___ntohl(x) +#define htons(x) ___htons(x) +#define ntohs(x) ___ntohs(x) +#endif + +void libcfs_run_upcall(char **argv); +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +void libcfs_debug_set_level(unsigned int debug_level); + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptable != NULL: size of array is number of CPU partitions + * cptable == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destroy per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); +void *cfs_percpt_current(void *vars); +void *cfs_percpt_index(void *vars, int idx); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by count. 
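/*
 * Illustrative sketch (editorial example; the *_example names are
 * hypothetical): cfs_percpt_alloc() hands back an array with one element
 * per CPU partition, and cfs_percpt_for_each() walks that array.  The
 * LIBCFS_ALLOC()/LIBCFS_FREE() pair must quote the same size so the
 * libcfs_kmemory accounting above stays balanced.
 */
struct percpt_example_counter {
	int	pec_count;
};

static inline void cfs_percpt_example(struct cfs_cpt_table *cptab)
{
	struct percpt_example_counter **cnts;
	struct percpt_example_counter *cnt;
	struct percpt_example_counter *one;
	int i;

	cnts = cfs_percpt_alloc(cptab, sizeof(*cnt));
	if (cnts != NULL) {
		cfs_percpt_for_each(cnt, i, cnts)
			cnt->pec_count = 0;	/* per-partition init */
		cfs_percpt_free(cnts);
	}

	LIBCFS_ALLOC(one, sizeof(*one));
	if (one != NULL)
		LIBCFS_FREE(one, sizeof(*one));
}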
+ */ +void *cfs_array_alloc(int count, unsigned int size); +void cfs_array_free(void *vars); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ +do { \ + LASSERTF(atomic_read(a) == v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ +do { \ + LASSERTF(atomic_read(a) != v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ +do { \ + LASSERTF(atomic_read(a) < v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ +do { \ + LASSERTF(atomic_read(a) <= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ +do { \ + LASSERTF(atomic_read(a) > v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ +do { \ + LASSERTF(atomic_read(a) >= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof(*(ptr))) +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof(*(ptr))) + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . 
change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. + */ + +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +static inline int +cfs_percpt_lock_num(struct cfs_percpt_lock *pcl) +{ + return cfs_cpt_number(pcl->pcl_cptab); +} + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); +/* create percpt (atomic) refcount based on @cptab */ +atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val); +/* destroy percpt refcount */ +void cfs_percpt_atomic_free(atomic_t **refs); +/* return sum of all percpu refs */ +int cfs_percpt_atomic_summary(atomic_t **refs); + +/** Compile-time assertion. + + * Check an invariant described by a constant expression at compile time by + * forcing a compiler error if it does not hold. \a cond must be a constant + * expression as defined by the ISO C Standard: + * + * 6.8.4.2 The switch statement + * .... + * [#3] The expression of each case label shall be an integer + * constant expression and no two of the case constant + * expressions in the same switch statement shall have the same + * value after conversion... 
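/*
 * Illustrative sketch (editorial example; percpt_lock_example is a
 * hypothetical name): per the description above, an update to one
 * partition's private data takes just that partition's lock, while a change
 * to shared data takes every private lock by passing the exclusive index
 * CFS_PERCPT_LOCK_EX.
 */
static inline void percpt_lock_example(struct cfs_percpt_lock *pcl, int cpt)
{
	/* fast path: data private to partition @cpt */
	cfs_percpt_lock(pcl, cpt);
	cfs_percpt_unlock(pcl, cpt);

	/* slow path (rare): data shared by all partitions */
	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
}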
+ * + */ +#define CLASSERT(cond) do {switch (42) {case (cond): case 0: break; } } while (0) + +/* support decl needed both by kernel and liblustre */ +int libcfs_isknown_lnd(int type); +char *libcfs_lnd2modname(int type); +char *libcfs_lnd2str(int type); +int libcfs_str2lnd(const char *str); +char *libcfs_net2str(__u32 net); +char *libcfs_nid2str(lnet_nid_t nid); +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +char *libcfs_id2str(lnet_process_id_t id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); + +/** \addtogroup lnet_addr + * @{ */ +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ +#define LNET_NIDADDR(nid) ((__u32)((nid) & 0xffffffff)) +/** extract the network part of an lnet_nid_t */ +#define LNET_NIDNET(nid) ((__u32)(((nid) >> 32)) & 0xffffffff) +/** make an lnet_nid_t from a network part and an address part */ +#define LNET_MKNID(net, addr) ((((__u64)(net))<<32)|((__u64)(addr))) +/* how net encodes type:number */ +#define LNET_NETNUM(net) ((net) & 0xffff) +#define LNET_NETTYP(net) (((net) >> 16) & 0xffff) +#define LNET_MKNET(typ, num) ((((__u32)(typ))<<16)|((__u32)(num))) +/** @} lnet_addr */ + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +/* -------------------------------------------------------------------- + * Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. + * -------------------------------------------------------------------- */ + +struct libcfs_device_userstate { + int ldu_memhog_pages; + struct page *ldu_memhog_root_page; +}; + +#define MKSTR(ptr) ((ptr)) ? (ptr) : "" + +static inline int cfs_size_round4(int val) +{ + return (val + 3) & (~0x3); +} + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round(int val) +{ + return (val + 7) & (~0x7); +} + +#define HAVE_CFS_SIZE_ROUND +#endif + +static inline int cfs_size_round16(int val) +{ + return (val + 0xf) & (~0xf); +} + +static inline int cfs_size_round32(int val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline int cfs_size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t cfs_round_strlen(char *fset) +{ + return (size_t)cfs_size_round((int)strlen(fset) + 1); +} + +/* roundup \a val to power2 */ +static inline unsigned int cfs_power2_roundup(unsigned int val) +{ + if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */ + do { + val &= ~LOWEST_BIT_SET(val); + } while (val != LOWEST_BIT_SET(val)); + /* ...and round up */ + val <<= 1; + } + return val; +} + +#define LOGL(var, len, ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGU(var, len, ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGL0(var, len, ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char *)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += cfs_size_round(len + 1); \ +} while (0) + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. 
don't ever change or redefine them): + * network addresses depend on them... */ + QSWLND = 1, + SOCKLND = 2, + GMLND = 3, /* obsolete, keep it so that libcfs_nid2str works */ + PTLLND = 4, + O2IBLND = 5, + CIBLND = 6, + OPENIBLND = 7, + IIBLND = 8, + LOLND = 9, + RALND = 10, + VIBLND = 11, + MXLND = 12, + GNILND = 13, +}; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h new file mode 100644 index 000000000..509dc1e5c --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +/* string comparison ignoring case */ +int cfs_strncasecmp(const char *s1, const char *s2, size_t n); +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask); +/* trim leading and trailing space characters */ +char *cfs_firststr(char *str, size_t size); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. 
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +char *cfs_trimwhite(char *str); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +static inline void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + LIBCFS_FREE(values, num * sizeof(values[0])); +} + +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +void cfs_ip_addr_free(struct list_head *list); + +#define strtoul(str, endp, base) simple_strtoul(str, endp, base) + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h new file mode 100644 index 000000000..5de6da085 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h @@ -0,0 +1,131 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_time.h + * + * Time functions. + * + */ + +#ifndef __LIBCFS_TIME_H__ +#define __LIBCFS_TIME_H__ +/* + * generic time manipulation functions. 
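/*
 * Illustrative sketch (editorial example; expr_list_example is a
 * hypothetical name): cfs_expr_list_parse() turns a bracketed range
 * expression such as "[0-6/2]" (lo 0, hi 6, stride 2) into a cfs_expr_list
 * that cfs_expr_list_match() can test values against.
 */
static inline int expr_list_example(void)
{
	struct cfs_expr_list *el;
	char expr[] = "[0-6/2]";
	int matched;
	int rc;

	rc = cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 7, &el);
	if (rc != 0)
		return rc;

	matched = cfs_expr_list_match(4, el);	/* 4 is in {0, 2, 4, 6} */
	cfs_expr_list_free(el);

	return matched;
}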
+ */ + +static inline unsigned long cfs_time_add(unsigned long t, long d) +{ + return (unsigned long)(t + d); +} + +static inline unsigned long cfs_time_sub(unsigned long t1, unsigned long t2) +{ + return (unsigned long)(t1 - t2); +} + +static inline int cfs_time_after(unsigned long t1, unsigned long t2) +{ + return time_before(t2, t1); +} + +static inline int cfs_time_aftereq(unsigned long t1, unsigned long t2) +{ + return time_before_eq(t2, t1); +} + +static inline unsigned long cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small, + struct timeval *result) +{ + long r = (long)( + (large->tv_sec - small->tv_sec) * ONE_MILLION + + (large->tv_usec - small->tv_usec)); + if (result != NULL) { + result->tv_usec = r % ONE_MILLION; + result->tv_sec = r / ONE_MILLION; + } + return r; +} + +static inline void cfs_slow_warning(unsigned long now, int seconds, char *msg) +{ + if (cfs_time_after(cfs_time_current(), + cfs_time_add(now, cfs_time_seconds(15)))) + CERROR("slow %s "CFS_TIME_T" sec\n", msg, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), now))); +} + +#define CFS_RATELIMIT(seconds) \ +({ \ + /* \ + * XXX nikita: non-portable initializer \ + */ \ + static time_t __next_message; \ + int result; \ + \ + if (cfs_time_after(cfs_time_current(), __next_message)) \ + result = 1; \ + else { \ + __next_message = cfs_time_shift(seconds); \ + result = 0; \ + } \ + result; \ +}) + +/* + * helper function similar to do_gettimeofday() of Linux kernel + */ +static inline void cfs_fs_timeval(struct timeval *tv) +{ + struct timespec time; + + cfs_fs_time_current(&time); + cfs_fs_time_usec(&time, tv); +} + +/* + * return valid time-out based on user supplied one. Currently we only check + * that time-out is not shorted than allowed. + */ +static inline long cfs_timeout_cap(long timeout) +{ + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; +} + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h new file mode 100644 index 000000000..5cc64f327 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
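/*
 * Illustrative sketch (editorial example; deadline_expired_example is a
 * hypothetical name): the usual pattern with the helpers above is to record
 * a deadline in jiffies with cfs_time_shift() and then poll it with
 * cfs_time_after(), which is safe across jiffies wrap.
 */
static inline int deadline_expired_example(unsigned long deadline)
{
	/* deadline was set earlier, e.g. deadline = cfs_time_shift(10); */
	return cfs_time_after(cfs_time_current(), deadline);
}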
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. + */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); +typedef struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** arg for working function */ + void *wi_data; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +} cfs_workitem_t; + +static inline void +cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_data = data; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h new file mode 100644 index 000000000..4fe50841e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
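/*
 * Illustrative sketch (editorial example; the wi_*_example names are
 * hypothetical): a caller binds an action to a workitem with cfs_wi_init()
 * and queues it on a scheduler with cfs_wi_schedule().  Returning 0 from
 * the action leaves the workitem usable for rescheduling; returning
 * non-zero tells the scheduler never to touch it again.
 */
static inline int wi_action_example(struct cfs_workitem *wi)
{
	/* do the deferred work using wi->wi_data ... */
	return 0;	/* workitem may be scheduled again later */
}

static inline void wi_schedule_example(struct cfs_wi_sched *sched,
				       cfs_workitem_t *wi, void *data)
{
	cfs_wi_init(wi, data, wi_action_example);
	cfs_wi_schedule(sched, wi);
}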
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_LIBCFS_H__ +#define __LIBCFS_LINUX_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "linux-cpu.h" +#include "linux-time.h" +#include "linux-mem.h" + + +#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/* initial pid */ +#define LUSTRE_LNET_PID 12345 + +#define __current_nesting_level() (0) + +/** + * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) + * + * Implementation is in linux-curproc.c + */ +#define CFS_CURPROC_COMM_MAX (sizeof((struct task_struct *)0)->comm) + +#include + +/* long integer with size equal to pointer */ +typedef unsigned long ulong_ptr_t; +typedef long long_ptr_t; + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + + +#endif /* _LINUX_LIBCFS_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h new file mode 100644 index 000000000..520209f17 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* spread rotor for NUMA allocator */ + unsigned cpt_spread_rotor; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* version, reserved for hotplug */ + unsigned ctb_version; + /* spread rotor for NUMA allocator */ + unsigned ctb_spread_rotor; + /* # of CPU partitions */ + unsigned ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +#endif /* CONFIG_SMP */ +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h new file mode 100644 index 000000000..0f2fd79e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
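/*
 * Illustrative sketch (editorial example, CONFIG_SMP only; the helper name
 * and the assumption that an unmapped CPU is recorded as a negative index
 * in ctb_cpu2cpt are not stated in the original header): ctb_cpu2cpt is the
 * reverse map from a HW CPU number to its partition, and each partition
 * carries the cpumask it spans.
 */
static inline cpumask_t *
cfs_cpt_cpumask_of_cpu_example(struct cfs_cpt_table *cptab, int cpu)
{
	int cpt = cptab->ctb_cpu2cpt[cpu];	/* partition owning @cpu */

	if (cpt < 0)				/* CPU not in any partition */
		return NULL;

	return cptab->ctb_parts[cpt].cpt_cpumask;
}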
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_LIBCFS_CPT +/* Need this for cfs_cpt_table */ +#include "../libcfs_cpu.h" +#endif + +#define CFS_PAGE_MASK (~((__u64)PAGE_CACHE_SIZE-1)) +#define page_index(p) ((p)->index) + +#define memory_pressure_get() (current->flags & PF_MEMALLOC) +#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0) +#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0) + +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +#define NUM_CACHEPAGES \ + min(totalram_pages, 1UL << (30 - PAGE_CACHE_SHIFT) * 3 / 4) +#else +#define NUM_CACHEPAGES totalram_pages +#endif + +#define DECL_MMSPACE mm_segment_t __oldfs +#define MMSPACE_OPEN \ + do { __oldfs = get_fs(); set_fs(get_ds()); } while (0) +#define MMSPACE_CLOSE set_fs(__oldfs) + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h new file mode 100644 index 000000000..0fc490bac --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h @@ -0,0 +1,144 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + +#include +#include +#include +#include + +/* + * post 2.5 kernels. 
+ */ + +#include + + +static inline void cfs_fs_time_usec(struct timespec *t, struct timeval *v) +{ + v->tv_sec = t->tv_sec; + v->tv_usec = t->tv_nsec / 1000; +} + +/* + * Generic kernel stuff + */ + +static inline unsigned long cfs_time_current(void) +{ + return jiffies; +} + +static inline void cfs_fs_time_current(struct timespec *t) +{ + *t = CURRENT_TIME; +} + +static inline time_t cfs_fs_time_sec(struct timespec *t) +{ + return t->tv_sec; +} + +static inline long cfs_time_seconds(int seconds) +{ + return ((long)seconds) * HZ; +} + +static inline time_t cfs_duration_sec(long d) +{ + return d / HZ; +} + +static inline void cfs_duration_usec(long d, struct timeval *s) +{ +#if (BITS_PER_LONG == 32) && (HZ > 4096) + __u64 t; + + s->tv_sec = d / HZ; + t = (d - (long)s->tv_sec * HZ) * ONE_MILLION; + do_div(t, HZ); + s->tv_usec = t; +#else + s->tv_sec = d / HZ; + s->tv_usec = ((d - (long)s->tv_sec * HZ) * ONE_MILLION) / HZ; +#endif +} + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); +} + +static inline int cfs_time_before_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 > 0; +} + +static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 >= 0; +} + +/* + * One jiffy + */ +#define CFS_TICK (1) + +#define CFS_TIME_T "%lu" +#define CFS_DURATION_T "%ld" + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h b/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h new file mode 100644 index 000000000..8f7fa28b5 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
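/*
 * Illustrative sketch (editorial example; deadline64_expired_example is a
 * hypothetical name): the 64-bit variants follow the same deadline pattern
 * as the jiffies helpers, and the subtraction-based comparison keeps the
 * test safe if the counter wraps.
 */
static inline int deadline64_expired_example(__u64 deadline)
{
	/* deadline was set earlier, e.g. deadline = cfs_time_shift_64(30); */
	return cfs_time_beforeq_64(deadline, cfs_time_current_64());
}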
+ */ + +#ifndef __LNET_API_SUPPORT_H__ +#define __LNET_API_SUPPORT_H__ + +#include "linux/api-support.h" + +#include "../libcfs/libcfs.h" +#include "types.h" +#include "lnet.h" + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/api.h b/kernel/drivers/staging/lustre/include/linux/lnet/api.h new file mode 100644 index 000000000..cd8651757 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/api.h @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * + * LNet can run both in OS kernel space and in userspace as a library. + * @{ + */ + +#include "../lnet/types.h" + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetInit(void); +void LNetFini(void); + +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. 
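/*
 * Illustrative sketch (editorial example; lnet_startup_example is a
 * hypothetical name, and LUSTRE_LNET_PID from the linux libcfs header above
 * is used only as a sample pid): a kernel user initializes the API, brings
 * up a network interface, queries its own identity, and tears both down in
 * reverse order.
 */
static inline int lnet_startup_example(void)
{
	lnet_process_id_t id;
	int rc;

	rc = LNetInit();
	if (rc != 0)
		return rc;

	rc = LNetNIInit(LUSTRE_LNET_PID);
	if (rc != 0) {
		LNetFini();
		return rc;
	}

	rc = LNetGetId(0, &id);		/* NID/PID of the first local NI */

	LNetNIFini();
	LNetFini();
	return rc;
}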
+ * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, lnet_process_id_t *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle); + +/** @} lnet_addr */ + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach() + * and LNetMEInsert(), and removed from its list by LNetMEUnlink(). + * @{ */ +int LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t pos_in, + lnet_handle_me_t *handle_out); + +int LNetMEInsert(lnet_handle_me_t current_in, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t position_in, + lnet_handle_me_t *handle_out); + +int LNetMEUnlink(lnet_handle_me_t current_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(lnet_handle_me_t current_in, + lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDBind(lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDUnlink(lnet_handle_md_t md_in); +/** @} lnet_md */ + +/** \defgroup lnet_eq Events and event queues + * + * Event queues (abbreviated as EQ) are used to log operations performed on + * local MDs. In particular, they signal the completion of a data transmission + * into or out of a MD. They can also be used to hold acknowledgments for + * completed PUT operations and indicate when a MD has been unlinked. Multiple + * MDs can share a single EQ. An EQ may have an optional event handler + * associated with it. If an event handler exists, it will be run for each + * event that is deposited into the EQ. + * + * In addition to the lnet_handle_eq_t, the LNet API defines two types + * associated with events: The ::lnet_event_kind_t defines the kinds of events + * that can be stored in an EQ. The lnet_event_t defines a structure that + * holds the information about with an event. + * + * There are five functions for dealing with EQs: LNetEQAlloc() is used to + * create an EQ and allocate the resources needed, while LNetEQFree() + * releases these resources and free the EQ. LNetEQGet() retrieves the next + * event from an EQ, and LNetEQWait() can be used to block a process until + * an EQ has at least one event. LNetEQPoll() can be used to test or wait + * on multiple EQs. 
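+ *
+ * A minimal usage sketch (illustration only: the queue size of 64 is an
+ * arbitrary assumption, LNET_EQ_HANDLER_NONE is assumed to be defined in
+ * types.h, and error handling is elided):
+ * \code
+ *	lnet_handle_eq_t eqh;
+ *	lnet_event_t ev;
+ *
+ *	if (LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh) == 0) {
+ *		LNetEQWait(eqh, &ev);
+ *		LNetEQFree(eqh);
+ *	}
+ * \endcode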
+ * @{ */ +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + lnet_handle_eq_t *handle_out); + +int LNetEQFree(lnet_handle_eq_t eventq_in); + +int LNetEQGet(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + +int LNetEQWait(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + +int LNetEQPoll(lnet_handle_eq_t *eventqs_in, + int neq_in, + int timeout_ms, + lnet_event_t *event_out, + int *which_eq_out); +/** @} lnet_eq */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). + * @{ */ +int LNetPut(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_ack_req_t ack_req_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in); +/** @} lnet_data */ + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. + * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +int LNetSetAsync(lnet_process_id_t id, int nasync); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h new file mode 100644 index 000000000..0038d29a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h @@ -0,0 +1,883 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +#include "linux/lib-lnet.h" +#include "../libcfs/libcfs.h" +#include "types.h" +#include "lnet.h" +#include "lib-types.h" + +extern lnet_t the_lnet; /* THE network */ + +#if defined(LNET_USE_LIB_FREELIST) +/* 1 CPT, simplify implementation... 
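+ * (with LNET_CPT_MAX_BITS set to 0 just below, LNET_CPT_MAX works out to
+ * 1 << 0 == 1, so all freelist-backed resources live in a single global
+ * container)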
*/ +# define LNET_CPT_MAX_BITS 0 + +#else /* KERNEL and no freelist */ + +# if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +# else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. + */ +# define LNET_CPT_MAX_BITS 8 +# endif /* BITS_PER_LONG == 32 */ +#endif + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +static inline int lnet_is_wire_handle_none(lnet_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(lnet_libmd_t *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(lnet_libmd_t *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or + * LNetM[DE]Unlink, in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) +#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) +#define LNET_MUTEX_LOCK(m) mutex_lock(m) +#define LNET_MUTEX_UNLOCK(m) mutex_unlock(m) + +#define MAX_PORTALS 64 + +/* these are only used by code with LNET_USE_LIB_FREELIST, but we still + * exported them to !LNET_USE_LIB_FREELIST for easy implementation */ +#define LNET_FL_MAX_MES 2048 +#define LNET_FL_MAX_MDS 2048 +#define LNET_FL_MAX_EQS 512 +#define LNET_FL_MAX_MSGS 2048 /* Outstanding messages */ + +#ifdef LNET_USE_LIB_FREELIST + +int lnet_freelist_init(lnet_freelist_t *fl, int n, int size); +void lnet_freelist_fini(lnet_freelist_t *fl); + +static inline void * +lnet_freelist_alloc(lnet_freelist_t *fl) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o; + + if (list_empty(&fl->fl_list)) + return NULL; + + o = list_entry(fl->fl_list.next, lnet_freeobj_t, fo_list); + list_del(&o->fo_list); + return (void *)&o->fo_contents; +} + +static inline void +lnet_freelist_free(lnet_freelist_t *fl, void *obj) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o = list_entry(obj, lnet_freeobj_t, fo_contents); + + list_add(&o->fo_list, &fl->fl_list); +} + +static inline lnet_eq_t * +lnet_eq_alloc(void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + lnet_eq_t *eq; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + eq = (lnet_eq_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return eq; +} + +static inline void +lnet_eq_free_locked(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, eq); +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + lnet_res_lock(0); + lnet_eq_free_locked(eq); + lnet_res_unlock(0); +} + +static inline lnet_libmd_t * +lnet_md_alloc(lnet_md_t *umd) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + lnet_libmd_t *md; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + md = (lnet_libmd_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + if (md != NULL) + INIT_LIST_HEAD(&md->md_list); + + return md; +} + +static inline void +lnet_md_free_locked(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + + 
LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, md); +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + lnet_res_lock(0); + lnet_md_free_locked(md); + lnet_res_unlock(0); +} + +static inline lnet_me_t * +lnet_me_alloc(void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + lnet_me_t *me; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + me = (lnet_me_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return me; +} + +static inline void +lnet_me_free_locked(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, me); +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + lnet_res_lock(0); + lnet_me_free_locked(me); + lnet_res_unlock(0); +} + +static inline lnet_msg_t * +lnet_msg_alloc(void) +{ + /* NEVER called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + lnet_msg_t *msg; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + msg = (lnet_msg_t *)lnet_freelist_alloc(&msc->msc_freelist); + lnet_net_unlock(0); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset(msg, 0, sizeof(*msg)); + } + return msg; +} + +static inline void +lnet_msg_free_locked(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + LASSERT(!msg->msg_onactivelist); + lnet_freelist_free(&msc->msc_freelist, msg); +} + +static inline void +lnet_msg_free(lnet_msg_t *msg) +{ + lnet_net_lock(0); + lnet_msg_free_locked(msg); + lnet_net_unlock(0); +} + +#else /* !LNET_USE_LIB_FREELIST */ + +static inline lnet_eq_t * +lnet_eq_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_eq_t *eq; + + LIBCFS_ALLOC(eq, sizeof(*eq)); + return eq; +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(eq, sizeof(*eq)); +} + +static inline lnet_libmd_t * +lnet_md_alloc(lnet_md_t *umd) +{ + /* NEVER called with liblock held */ + lnet_libmd_t *md; + unsigned int size; + unsigned int niov; + + if ((umd->options & LNET_MD_KIOV) != 0) { + niov = umd->length; + size = offsetof(lnet_libmd_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? 
+ umd->length : 1; + size = offsetof(lnet_libmd_t, md_iov.iov[niov]); + } + + LIBCFS_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->md_options = umd->options; + md->md_niov = niov; + INIT_LIST_HEAD(&md->md_list); + } + + return md; +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + unsigned int size; + + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]); + + LIBCFS_FREE(md, size); +} + +static inline lnet_me_t * +lnet_me_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_me_t *me; + + LIBCFS_ALLOC(me, sizeof(*me)); + return me; +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(me, sizeof(*me)); +} + +static inline lnet_msg_t * +lnet_msg_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_msg_t *msg; + + LIBCFS_ALLOC(msg, sizeof(*msg)); + + /* no need to zero, LIBCFS_ALLOC does for us */ + return msg; +} + +static inline void +lnet_msg_free(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + LASSERT(!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); +} + +#define lnet_eq_free_locked(eq) lnet_eq_free(eq) +#define lnet_md_free_locked(md) lnet_md_free(md) +#define lnet_me_free_locked(me) lnet_me_free(me) +#define lnet_msg_free_locked(msg) lnet_msg_free(msg) + +#endif /* LNET_USE_LIB_FREELIST */ + +lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + lnet_libhandle_t *lh); +static inline void +lnet_res_lh_invalidate(lnet_libhandle_t *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_eq2handle(lnet_handle_eq_t *handle, lnet_eq_t *eq) +{ + if (eq == NULL) { + LNetInvalidateHandle(handle); + return; + } + + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lnet_eq_t * +lnet_handle2eq(lnet_handle_eq_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + + lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_eq_t, eq_lh); +} + +static inline void +lnet_md2handle(lnet_handle_md_t *handle, lnet_libmd_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lnet_libmd_t * +lnet_handle2md(lnet_handle_md_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline lnet_libmd_t * +lnet_wire_handle2md(lnet_handle_wire_t *wh) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline void +lnet_me2handle(lnet_handle_me_t *handle, lnet_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lnet_me_t * +lnet_handle2me(lnet_handle_me_t *handle) +{ + /* ALWAYS 
called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_me_t, me_lh); +} + +static inline void +lnet_peer_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + lp->lp_refcount++; +} + +void lnet_destroy_peer_locked(lnet_peer_t *lp); + +static inline void +lnet_peer_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + lp->lp_refcount--; + if (lp->lp_refcount == 0) + lnet_destroy_peer_locked(lp); +} + +static inline int +lnet_isrouter(lnet_peer_t *lp) +{ + return lp->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +void lnet_ni_free(lnet_ni_t *ni); + +static inline int +lnet_nid2peerhash(lnet_nid_t nid) +{ + return hash_long(nid, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +extern lnd_t the_lolnd; +extern int avoid_asym_router_failure; + +int lnet_cpt_of_nid_locked(lnet_nid_t nid); +int lnet_cpt_of_nid(lnet_nid_t nid); +lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt); +lnet_ni_t *lnet_net2ni(__u32 net); + +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, unsigned long when); +void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, + unsigned long when); +int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid, + unsigned int priority); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority); +void lnet_proc_init(void); +void lnet_proc_fini(void); +int lnet_rtrpools_alloc(int im_a_router); +void lnet_rtrpools_free(void); +lnet_remotenet_t *lnet_find_net_locked(__u32 net); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen); +void lnet_msg_detach_md(lnet_msg_t *msg, int status); +void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev); +void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type); +void lnet_msg_commit(lnet_msg_t *msg, int cpt); +void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status); + +void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev); +void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len); +int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid); +void lnet_return_tx_credits_locked(lnet_msg_t *msg); +void lnet_return_rx_credits_locked(lnet_msg_t *msg); + +/* portals functions */ +/* portals 
attributes */ +static inline int +lnet_ptl_is_lazy(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + lnet_process_id_t id, __u64 mbits, + __u64 ignore_bits, + lnet_ins_pos_t pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen); +lnet_msg_t *lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *get_msg); +void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len); +void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_msgtyp2str(int type); +void lnet_print_hdr(lnet_hdr_t *hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +void lnet_counters_get(lnet_counters_t *counters); +void lnet_counters_reset(void); + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +int lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob(unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void 
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + struct kvec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen}; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen}; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct kvec siov = {/*.iov_base = */ src, /*.iov_len = */slen}; + + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = {/* .iov_base = */ src, /* .iov_len = */ slen}; + + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(lnet_me_t *me); + +void lnet_md_unlink(lnet_libmd_t *md); +void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); + +void lnet_register_lnd(lnd_t *lnd); +void lnet_unregister_lnd(lnd_t *lnd); +int lnet_set_ip_niaddr(lnet_ni_t *ni); + +int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nis(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); + +int lnet_count_acceptor_nis(void); +int lnet_acceptor_port(void); + +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +void lnet_get_tunables(void); +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(lnet_ni_t *ni); + +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); +void lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net); +void lnet_swap_pinginfo(lnet_ping_info_t *info); + +int lnet_ping_target_init(void); +void lnet_ping_target_fini(void); +int lnet_ping(lnet_process_id_t id, int timeout_ms, + lnet_process_id_t *ids, int n_ids); + +int lnet_parse_ip2nets(char **networksp, char *ip2nets); +int lnet_parse_routes(char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, char *networks); + +int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt); +lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable, + lnet_nid_t nid); +void lnet_peer_tables_cleanup(void); +void lnet_peer_tables_destroy(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); + +static inline void lnet_peer_set_alive(lnet_peer_t *lp) +{ + lp->lp_last_alive = lp->lp_last_query = get_seconds(); + if (!lp->lp_alive) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); +} + + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h b/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h new file mode 100644 index 000000000..50537668f --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -0,0 +1,760 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#include "linux/lib-types.h" + +#include "../libcfs/libcfs.h" +#include +#include "types.h" + +#define WIRE_ATTR __attribute__((packed)) + +/* Packed version of lnet_process_id_t to transfer via network */ +typedef struct { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} WIRE_ATTR lnet_process_id_packed_t; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR lnet_handle_wire_t; + +typedef enum { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} lnet_msg_type_t; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
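+ * For example, in lnet_put_t below the 16-byte wire handle and the two
+ * __u64 fields come first, and the two __u32 fields (ptl_index, offset)
+ * are placed last, so every 64-bit member stays naturally aligned with
+ * no implicit padding.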
*/ +typedef struct lnet_ack { + lnet_handle_wire_t dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR lnet_ack_t; + +typedef struct lnet_put { + lnet_handle_wire_t ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR lnet_put_t; + +typedef struct lnet_get { + lnet_handle_wire_t return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR lnet_get_t; + +typedef struct lnet_reply { + lnet_handle_wire_t dst_wmd; +} WIRE_ATTR lnet_reply_t; + +typedef struct lnet_hello { + __u64 incarnation; + __u32 type; +} WIRE_ATTR lnet_hello_t; + +typedef struct { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* lnet_msg_type_t */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union { + lnet_ack_t ack; + lnet_put_t put; + lnet_get_t get; + lnet_reply_t reply; + lnet_hello_t hello; + } msg; +} WIRE_ATTR lnet_hdr_t; + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr_t::msg. + */ +typedef struct { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR lnet_magicversion_t; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_RA_MAGIC 0x0be91b92 +#define LNET_PROTO_QSW_MAGIC 0x0be91b93 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define LNET_PROTO_MX_MAGIC 0x4d583130 /* 'MX10'! */ +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond with a + * "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +typedef struct { + __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR lnet_acceptor_connreq_t; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +/* forward refs */ +struct lnet_libmd; + +typedef struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + lnet_process_id_t msg_target; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a global router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + + struct lnet_peer *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct kvec *msg_iov; + lnet_kiov_t *msg_kiov; + + lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; +} lnet_msg_t; + +typedef struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lnet_libhandle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +typedef struct lnet_eq { + struct list_head eq_list; + lnet_libhandle_t eq_lh; + lnet_seq_t eq_enq_seq; + lnet_seq_t eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + lnet_event_t *eq_events; + int **eq_refs; /* percpt refcount for EQ */ +} lnet_eq_t; + +typedef struct lnet_me { + struct list_head me_list; + lnet_libhandle_t me_lh; + lnet_process_id_t me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + lnet_unlink_t me_unlink; + struct lnet_libmd *me_md; +} lnet_me_t; + +typedef struct lnet_libmd { + struct list_head md_list; + lnet_libhandle_t md_lh; + lnet_me_t *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + void *md_user_ptr; + lnet_eq_t *md_eq; + unsigned int md_niov; /* # frags */ + union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; +} lnet_libmd_t; + +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define 
LNET_MD_FLAG_ABORTED (1 << 2) + +#ifdef LNET_USE_LIB_FREELIST +typedef struct { + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lnet_freelist_t; + +typedef struct { + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lnet_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lnet_test_peer_t; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct lnet_ni; /* forward ref */ + +typedef struct lnet_lnd { + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialised by the LND */ + unsigned int lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct iovec *iov != NULL) + * OR + * in pages (kernel only: plt_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' and may specify a byte offset within the set of + * fragments to start from + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immediate failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. 
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, unsigned long *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); + +} lnd_t; + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 +typedef struct { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +#define LNET_MAX_INTERFACES 16 + +typedef struct lnet_ni { + spinlock_t ni_lock; + struct list_head ni_list; /* chain on ln_nis */ + struct list_head ni_cptlist; /* chain on ln_nis_cpt */ + int ni_maxtxcredits; /* # tx credits */ + /* # per-peer send credits */ + int ni_peertxcredits; + /* # per-peer router buffer credits */ + int ni_peerrtrcredits; + /* seconds to consider peer dead */ + int ni_peertimeout; + int ni_ncpts; /* number of CPTs */ + __u32 *ni_cpts; /* bond NI on some CPTs */ + lnet_nid_t ni_nid; /* interface's NID */ + void *ni_data; /* instance-specific data */ + lnd_t *ni_lnd; /* procedural interface */ + struct lnet_tx_queue **ni_tx_queues; /* percpt TX queues */ + int **ni_refs; /* percpt reference count */ + long ni_last_alive; /* when I was last alive */ + lnet_ni_status_t *ni_status; /* my health status */ + /* equivalent interfaces to use */ + char *ni_interfaces[LNET_MAX_INTERFACES]; +} lnet_ni_t; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ + +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) + +typedef struct { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + lnet_ni_status_t pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct { + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + lnet_handle_md_t rcd_mdh; /* ping buffer MD */ + struct lnet_peer *rcd_gateway; /* reference to gateway */ + lnet_ping_info_t *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + +typedef struct lnet_peer { + struct list_head lp_hashlist; /* chain on peer hash */ + struct list_head lp_txq; /* messages blocking for tx credits */ + struct list_head lp_rtrq; /* messages blocking for router credits */ + struct list_head lp_rtr_list; /* chain on router list */ + int lp_txcredits; /* # tx credits available */ + int lp_mintxcredits; /* low water mark */ + int lp_rtrcredits; /* # router credits */ + int lp_minrtrcredits; /* low water mark */ + unsigned int lp_alive:1; /* alive/dead? */ + unsigned int lp_notify:1; /* notification outstanding? */ + unsigned int lp_notifylnd:1; /* outstanding notification for LND? 
*/ + unsigned int lp_notifying:1; /* some thread is handling notification */ + unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ + int lp_alive_count; /* # times router went dead<->alive */ + long lp_txqnob; /* bytes queued for sending */ + unsigned long lp_timestamp; /* time of last aliveness news */ + unsigned long lp_ping_timestamp; /* time of last ping attempt */ + unsigned long lp_ping_deadline; /* != 0 if ping reply expected */ + unsigned long lp_last_alive; /* when I was last alive */ + unsigned long lp_last_query; /* when lp_ni was queried last time */ + lnet_ni_t *lp_ni; /* interface peer is on */ + lnet_nid_t lp_nid; /* peer's NID */ + int lp_refcount; /* # refs */ + int lp_cpt; /* CPT this peer attached on */ + /* # refs from lnet_route_t::lr_gateway */ + int lp_rtr_refcount; + /* returned RC ping features */ + unsigned int lp_ping_feats; + struct list_head lp_routes; /* routers on this peer */ + lnet_rc_data_t *lp_rcd; /* router checker state */ +} lnet_peer_t; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* peer hash table */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + int pt_number; /* # peers extant */ + struct list_head pt_deathrow; /* zombie peers */ + struct list_head *pt_hash; /* NID->peer hash */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * lnet_ni_t::ni_peertimeout has been set to a positive value */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + (lp)->lp_ni->ni_peertimeout > 0) + +typedef struct { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + lnet_peer_t *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + unsigned int lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ +} lnet_route_t; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +typedef struct { + struct list_head lrn_list; /* chain on ln_remote_nets_hash */ + struct list_head lrn_routes; /* routes to me */ + __u32 lrn_net; /* my net number */ +} lnet_remotenet_t; + +typedef struct { + struct list_head rbp_bufs; /* my free buffer pool */ + struct list_head rbp_msgs; /* messages blocking for a buffer */ + int rbp_npages; /* # pages in each buffer */ + int rbp_nbuffers; /* # buffers */ + int rbp_credits; /* # free buffers / blocked messages */ + int rbp_mincredits; /* low water mark */ +} lnet_rtrbufpool_t; + +typedef struct { + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +typedef struct { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} WIRE_ATTR lnet_counters_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! 
*/ + +#define LNET_NRBPOOLS 3 /* # different router buffer pools */ + +enum { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for lnet_portal_t::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + lnet_process_id_t mi_id; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +typedef struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
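+ * (LNET_PTL_LAZY, LNET_PTL_MATCH_UNIQUE or LNET_PTL_MATCH_WILDCARD above)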
*/ + unsigned int ptl_options; + /* list of messages which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +} lnet_portal_t; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t rec_freelist; /* freelist for resources */ +#endif +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t msc_freelist; /* freelist for messages */ +#endif +}; + +/* Router Checker states */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ + +typedef struct { + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + lnet_portal_t **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... 
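+ * (taken per-CPT via lnet_net_lock()/lnet_net_unlock() in lib-lnet.h)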
*/ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + lnet_counters_t **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* failure simulation */ + struct list_head ln_test_peers; + + struct list_head ln_nis; /* LND instances */ + /* NIs bond on specific CPT(s) */ + struct list_head ln_nis_cpt; + /* dying LND instances */ + struct list_head ln_nis_zombie; + lnet_ni_t *ln_loni; /* the loopback NI */ + /* NI to wait for events in */ + lnet_ni_t *ln_eq_waitni; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + lnet_rtrbufpool_t **ln_rtrpools; + + lnet_handle_md_t ln_ping_target_md; + lnet_handle_eq_t ln_ping_target_eq; + lnet_ping_info_t *ln_ping_info; + + /* router checker startup/shutdown state */ + int ln_rc_state; + /* router checker's event queue */ + lnet_handle_eq_t ln_rc_eqh; + /* rcd still pending on net */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_rc_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + int ln_init; /* LNetInit() called? */ + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* shutdown in progress */ + int ln_shutdown; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* space for network names */ + char *ln_network_tokens; + int ln_network_tokens_nob; + /* test protocol compatibility flags */ + int ln_testprotocompat; + +} lnet_t; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h new file mode 100644 index 000000000..e237ad6af --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h @@ -0,0 +1,42 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_API_SUPPORT_H__ +#define __LINUX_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h new file mode 100644 index 000000000..0f8f04d1e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h @@ -0,0 +1,71 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_LNET_H__ +#define __LNET_LINUX_LIB_LNET_H__ + +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include +# include +#include "../../libcfs/libcfs.h" + +static inline __u64 +lnet_page2phys(struct page *p) +{ + /* compiler optimizer will elide unused branches */ + + switch (sizeof(typeof(page_to_phys(p)))) { + case 4: + /* page_to_phys returns a 32 bit physical address. This must + * be a 32 bit machine with <= 4G memory and we must ensure we + * don't sign extend when converting to 64 bits. */ + return (unsigned long)page_to_phys(p); + + case 8: + /* page_to_phys returns a 64 bit physical address :) */ + return page_to_phys(p); + + default: + LBUG(); + return 0; + } +} + +#define LNET_ROUTER + +#endif /* __LNET_LINUX_LIB_LNET_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h new file mode 100644 index 000000000..669e8c038 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h @@ -0,0 +1,45 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_TYPES_H__ +#define __LNET_LINUX_LIB_TYPES_H__ + +#ifndef __LNET_LIB_TYPES_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h new file mode 100644 index 000000000..1e888f1ef --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LNET_H__ +#define __LNET_LINUX_LNET_H__ + +#ifndef __LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +/* + * lnet.h + * + * User application interface file + */ + +#include +#include + +#define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage(sk, page, offset, size, flags) + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h new file mode 100644 index 000000000..2dee1b97f --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_SYSCTL_H__ +#define __LNET_SYSCTL_H__ + +#if defined(CONFIG_SYSCTL) + +#define CTL_KRANAL 201 +#define CTL_O2IBLND 205 +#define CTL_PTLLND 206 +#define CTL_QSWNAL 207 +#define CTL_SOCKLND 208 +#define CTL_GNILND 210 + +#endif + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h new file mode 100644 index 000000000..75c0ab919 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __LNET_H__ +#define __LNET_H__ + +/* + * lnet.h + * + * User application interface file + */ +#include "linux/lnet.h" + +#include "types.h" +#include "api.h" + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h new file mode 100644 index 000000000..98181d389 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h @@ -0,0 +1,80 @@ +/* + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#include "../libcfs/libcfs.h" +#include "types.h" + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" +#define LNET_DEV_MAJOR 10 +#define LNET_DEV_MINOR 240 +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME +#define OBD_DEV_MAJOR 10 +#define OBD_DEV_MINOR 241 +#define SMFS_DEV_ID 2 +#define SMFS_DEV_PATH "/dev/snapdev" +#define SMFS_DEV_MAJOR 10 +#define SMFS_DEV_MINOR 242 + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_list_nids(int argc, char **argv); +int jt_ptl_which_nid(int argc, char **argv); +int jt_ptl_print_interfaces(int argc, char **argv); +int jt_ptl_add_interface(int argc, char **argv); +int jt_ptl_del_interface(int argc, char **argv); +int jt_ptl_print_peers(int argc, char **argv); +int jt_ptl_add_peer(int argc, char **argv); +int jt_ptl_del_peer(int argc, char **argv); +int jt_ptl_print_connections(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_add_route(int argc, char **argv); +int jt_ptl_del_route(int argc, char **argv); +int jt_ptl_notify_router(int argc, char **argv); +int jt_ptl_print_routes(int argc, char **argv); +int jt_ptl_fail_nid(int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); +int jt_ptl_testprotocompat(int argc, char **argv); +int jt_ptl_memhog(int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int 
argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h new file mode 100644 index 000000000..885f708d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h @@ -0,0 +1,491 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lnetst.h + * + * Author: Liang Zhen + */ + +#ifndef __LNET_ST_H__ +#define __LNET_ST_H__ + +#include "../libcfs/libcfs.h" +#include "lnet.h" +#include "lib-types.h" + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query default information of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +typedef struct { + lnet_nid_t ses_nid; /* nid of console node */ + __u64 ses_stamp; /* time stamp */ +} lst_sid_t; /*** session id */ + +extern lst_sid_t LST_INVALID_SID; + +typedef struct { + __u64 bat_id; /* unique id in session */ +} lst_bid_t; /*** batch id (group of tests) */ + +/* Status of test node */ +#define LST_NODE_ACTIVE 
0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +typedef struct { + lnet_process_id_t nde_id; /* id of node */ + int nde_state; /* state of node */ +} lstcon_node_ent_t; /*** node entry, for list_group command */ + +typedef struct { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +} lstcon_ndlist_ent_t; /*** node_list entry, for list_batch command */ + +typedef struct { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +} lstcon_test_ent_t; /*** test summary entry, for list_batch command */ + +typedef struct { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +} lstcon_batch_ent_t; /*** batch summary entry, for list_batch command */ + +typedef struct { + lstcon_ndlist_ent_t tbe_cli_nle; /* client (group) node_list entry */ + lstcon_ndlist_ent_t tbe_srv_nle; /* server (group) node_list entry */ + union { + lstcon_test_ent_t tbe_test; /* test entry */ + lstcon_batch_ent_t tbe_batch; /* batch entry */ + } u; +} lstcon_test_batch_ent_t; /*** test/batch verbose information entry, + *** for list_batch command */ + +typedef struct { + struct list_head rpe_link; /* link chain */ + lnet_process_id_t rpe_peer; /* peer's id */ + struct timeval rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + lst_sid_t rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char rpe_payload[0]; /* private reply payload */ +} lstcon_rpc_ent_t; + +typedef struct { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +} lstcon_trans_stat_t; + +static inline int +lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? 
++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +typedef struct { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? */ + /** IN: session features */ + unsigned lstio_ses_feats; + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* IN: session name */ +} lstio_session_new_args_t; + +/* query current session */ +typedef struct { + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned *lstio_ses_featp; + lstcon_ndlist_ent_t *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* OUT: session name */ +} lstio_session_info_args_t; + +/* delete a session */ +typedef struct { + int lstio_ses_key; /* IN: session key */ +} lstio_session_end_args_t; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +typedef struct { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + lnet_process_id_t *lstio_dbg_idsp; /* IN: id of test nodes */ + struct list_head *lstio_dbg_resultp; /* OUT: list head of result buffer */ +} lstio_debug_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_add_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_del_args_t; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + lnet_process_id_t *lstio_grp_idsp; /* IN: array of nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result 
buffer */ +} lstio_group_update_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned *lstio_grp_featp; + lnet_process_id_t *lstio_grp_idsp; /* IN: nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result buffer */ +} lstio_group_nodes_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* OUT: name */ +} lstio_group_list_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* IN: name */ + lstcon_ndlist_ent_t *lstio_grp_entp; /* OUT: description of group */ + + int *lstio_grp_idxp; /* IN/OUT: node index */ + int *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_grp_dentsp; /* OUT: nodent array */ +} lstio_group_info_args_t; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_add_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_del_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_timeout; /* IN: timeout for the batch */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_run_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_force; /* IN: abort unfinished test RPC */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_stop_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_testidx; /* IN: test index */ + int lstio_bat_client; /* IN: is test client? 
*/ + int lstio_bat_timeout; /* IN: timeout for waiting */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_query_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_list_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + lstcon_test_batch_ent_t *lstio_bat_entp; /* OUT: batch ent */ + + int *lstio_bat_idxp; /* IN/OUT: index of node */ + int *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_bat_dentsp; /* array of nodent */ +} lstio_batch_info_args_t; + +/* add stat in session */ +typedef struct { + int lstio_sta_key; /* IN: session key */ + int lstio_sta_timeout; /* IN: timeout for stat request */ + int lstio_sta_nmlen; /* IN: group name length */ + char *lstio_sta_namep; /* IN: group name */ + int lstio_sta_count; /* IN: # of pid */ + lnet_process_id_t *lstio_sta_idsp; /* IN: pid */ + struct list_head *lstio_sta_resultp; /* OUT: list head of result buffer */ +} lstio_stat_args_t; + +typedef enum { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +} lst_test_type_t; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +typedef struct { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char *lstio_tes_sgrp_name; /* IN: group name */ + int lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char *lstio_tes_dgrp_name; /* IN: group name */ + + int lstio_tes_param_len; /* IN: param buffer len */ + void *lstio_tes_param; /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... 
more */ + int *lstio_tes_retp; /* OUT: private returned value */ + struct list_head *lstio_tes_resultp; /* OUT: list head of result buffer */ +} lstio_test_args_t; + +typedef enum { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +} lst_brw_type_t; + +typedef enum { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +} lst_brw_flags_t; + +typedef struct { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ +} lst_test_bulk_param_t; + +typedef struct { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +} lst_test_ping_param_t; + +/* more tests */ +typedef struct { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} WIRE_ATTR srpc_counters_t; + +typedef struct { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} WIRE_ATTR sfw_counters_t; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h new file mode 100644 index 000000000..c91d65329 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h @@ -0,0 +1,93 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd.h + * + * Author: PJ Kirner + */ + +/* + * The PTLLND was designed to support Portals with + * Lustre and non-lustre UNLINK semantics. + * However for now the two targets are Cray Portals + * on the XT3 and Lustre Portals (for testing) both + * have Lustre UNLINK semantics, so this is defined + * by default. 
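+ *
+ * Illustrative note only (not part of the imported header): the FMT_*
+ * macros defined below let the same code print a portals identifier on
+ * either flavour, e.g.
+ *
+ *	CERROR("peer is "FMT_PTLID"\n", pid, nid);
+ *
+ * formats the NID with "%llu" on Lustre Portals and with "%u" on Cray
+ * Portals.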
+ */ +#define LUSTRE_PORTALS_UNLINK_SEMANTICS + +#ifdef _USING_LUSTRE_PORTALS_ + +/* NIDs are 64-bits on Lustre Portals */ +#define FMT_NID "%llu" +#define FMT_PID "%d" + +/* When using Lustre Portals Lustre completion semantics are imlicit*/ +#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0 + +#else /* _USING_CRAY_PORTALS_ */ + +/* NIDs are integers on Cray Portals */ +#define FMT_NID "%u" +#define FMT_PID "%d" + +/* When using Cray Portals this is defined in the Cray Portals Header*/ +/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */ + +/* Can compare handles directly on Cray Portals */ +#define PtlHandleIsEqual(a, b) ((a) == (b)) + +/* Different error types on Cray Portals*/ +#define ptl_err_t ptl_ni_fail_t + +/* + * The Cray Portals has no maximum number of IOVs. The + * maximum is limited only by memory and size of the + * int parameters (2^31-1). + * Lustre only really require that the underyling + * implementation to support at least LNET_MAX_IOV, + * so for Cray portals we can safely just use that + * value here. + * + */ +#define PTL_MD_MAX_IOV LNET_MAX_IOV + +#endif + +#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID + +/* Align incoming small request messages to an 8 byte boundary if this is + * supported to avoid alignment issues on some architectures */ +#ifndef PTL_MD_LOCAL_ALIGN8 +# define PTL_MD_LOCAL_ALIGN8 0 +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h new file mode 100644 index 000000000..808f37b64 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h @@ -0,0 +1,119 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd_wire.h + * + * Author: PJ Kirner + */ + +/* Minimum buffer size that any peer will post to receive ptllnd messages */ +#define PTLLND_MIN_BUFFER_SIZE 256 + +/************************************************************************ + * Tunable defaults that {u,k}lnds/ptllnd should have in common. 
+ */ + +#define PTLLND_PORTAL 9 /* The same portal PTLPRC used when talking to cray portals */ +#define PTLLND_PID 9 /* The Portals PID */ +#define PTLLND_PEERCREDITS 8 /* concurrent sends to 1 peer */ + +/* Default buffer size for kernel ptllnds (guaranteed eager) */ +#define PTLLND_MAX_KLND_MSG_SIZE 512 + +/* Default buffer size for catamount ptllnds (not guaranteed eager) - large + * enough to avoid RDMA for anything sent while control is not in liblustre */ +#define PTLLND_MAX_ULND_MSG_SIZE 512 + +/************************************************************************ + * Portals LND Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved + * above is for bulk data transfer */ +#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */ + +typedef struct { + lnet_hdr_t kptlim_hdr; /* portals header */ + char kptlim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kptl_immediate_msg_t; + +typedef struct { + lnet_hdr_t kptlrm_hdr; /* portals header */ + __u64 kptlrm_matchbits; /* matchbits */ +} WIRE_ATTR kptl_rdma_msg_t; + +typedef struct { + __u64 kptlhm_matchbits; /* matchbits */ + __u32 kptlhm_max_msg_size; /* max message size */ +} WIRE_ATTR kptl_hello_msg_t; + +typedef struct { + /* First 2 fields fixed FOR ALL TIME */ + __u32 ptlm_magic; /* I'm a Portals LND message */ + __u16 ptlm_version; /* this is my version number */ + __u8 ptlm_type; /* the message type */ + __u8 ptlm_credits; /* returned credits */ + __u32 ptlm_nob; /* # bytes in whole message */ + __u32 ptlm_cksum; /* checksum (0 == no checksum) */ + __u64 ptlm_srcnid; /* sender's NID */ + __u64 ptlm_srcstamp; /* sender's incarnation */ + __u64 ptlm_dstnid; /* destination's NID */ + __u64 ptlm_dststamp; /* destination's incarnation */ + __u32 ptlm_srcpid; /* sender's PID */ + __u32 ptlm_dstpid; /* destination's PID */ + + union { + kptl_immediate_msg_t immediate; + kptl_rdma_msg_t rdma; + kptl_hello_msg_t hello; + } WIRE_ATTR ptlm_u; + +} kptl_msg_t; + +/* kptl_msg_t::ptlm_credits is only a __u8 */ +#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t *)0)->ptlm_credits)) - 1) + +#define PTLLND_MSG_MAGIC LNET_PROTO_PTL_MAGIC +#define PTLLND_MSG_VERSION 0x04 + +#define PTLLND_RDMA_OK 0x00 +#define PTLLND_RDMA_FAIL 0x01 + +#define PTLLND_MSG_TYPE_INVALID 0x00 +#define PTLLND_MSG_TYPE_PUT 0x01 +#define PTLLND_MSG_TYPE_GET 0x02 +#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/ +#define PTLLND_MSG_TYPE_NOOP 0x04 +#define PTLLND_MSG_TYPE_HELLO 0x05 +#define PTLLND_MSG_TYPE_NAK 0x06 diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h b/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h new file mode 100644 index 000000000..389038b12 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/socklnd.h + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include "types.h" +#include "lib-types.h" + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +typedef struct { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR ksock_hello_msg_t; + +typedef struct { + lnet_hdr_t ksnm_hdr; /* lnet hdr */ + + /* + * ksnm_payload is removed because of winnt compiler's limitation: + * zero-sized array can only be placed at the tail of [nested] + * structure definitions. lnet payload will be stored just after + * the body of structure ksock_lnet_msg_t + */ +} WIRE_ATTR ksock_lnet_msg_t; + +typedef struct { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ + union { + ksock_lnet_msg_t lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR ksock_msg_t; + +static inline void +socklnd_init_msg(ksock_msg_t *msg, int type) +{ + msg->ksm_csum = 0; + msg->ksm_type = type; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/types.h b/kernel/drivers/staging/lustre/include/linux/lnet/types.h new file mode 100644 index 000000000..68d8139a2 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/types.h @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ + +/** \addtogroup lnet + * @{ */ + +#include "../libcfs/libcfs.h" + +/** \addtogroup lnet_addr + * @{ */ + +/** Portal reserved for LNet's own use. + * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** + * Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id_t, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ + +#define LNET_TIME_FOREVER (-1) + +/** + * Objects maintained by the LNet are accessed through handles. Handle types + * have names of the form lnet_handle_xx_t, where xx is one of the two letter + * object type codes ('eq' for event queue, 'md' for memory descriptor, and + * 'me' for match entry). + * Each type of object is given a unique handle type to enhance type checking. + * The type lnet_handle_any_t can be used when a generic handle is needed. + * Every handle value can be converted into a value of type lnet_handle_any_t + * without loss of information. + */ +typedef struct { + __u64 cookie; +} lnet_handle_any_t; + +typedef lnet_handle_any_t lnet_handle_eq_t; +typedef lnet_handle_any_t lnet_handle_md_t; +typedef lnet_handle_any_t lnet_handle_me_t; + +#define LNET_WIRE_HANDLE_COOKIE_NONE (-1) + +/** + * Invalidate handle \a h. + */ +static inline void LNetInvalidateHandle(lnet_handle_any_t *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Compare handles \a h1 and \a h2. + * + * \return 1 if handles are equal, 0 if otherwise. + */ +static inline int LNetHandleIsEqual(lnet_handle_any_t h1, lnet_handle_any_t h2) +{ + return h1.cookie == h2.cookie; +} + +/** + * Check whether handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetHandleIsInvalid(lnet_handle_any_t h) +{ + return LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie; +} + +/** + * Global process ID. 
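+ *
+ * Illustrative sketch only (not part of the imported header): the
+ * wildcard constants and handle helpers defined above might be
+ * combined as follows when preparing a match-any peer id and a
+ * not-yet-attached descriptor handle.
+ *
+ *	lnet_process_id_t any_peer;
+ *	lnet_handle_md_t  mdh;
+ *
+ *	any_peer.nid = LNET_NID_ANY;	(match any end-point address)
+ *	any_peer.pid = LNET_PID_ANY;	(match any process on that node)
+ *	LNetInvalidateHandle(&mdh);	(cookie set to "none" before use)
+ *	LASSERT(LNetHandleIsInvalid(mdh));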
+ */ +typedef struct { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +} lnet_process_id_t; +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +typedef enum { + LNET_RETAIN = 0, + LNET_UNLINK +} lnet_unlink_t; + +/** + * Values of the type lnet_ins_pos_t are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +typedef enum { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +} lnet_ins_pos_t; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +typedef struct { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of lnet_kiov_t and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based + * fragments that are not necessarily mapped in virtual memory. + * - LNET_MD_IOVEC bit set: The start field points to the starting + * address of an array of struct iovec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct iovec is used to describe fragments + * that have virtual addresses. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. + * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. 
+ * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. + * - LNET_MD_KIOV: The start and length fields specify an array of + * lnet_kiov_t. + * - LNET_MD_IOVEC: The start and length fields specify an array of + * struct iovec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * + * Note: + * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather + * capability for memory descriptors. They can't be both set. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * A handle for the event queue used to log the operations performed on + * the memory region. If this argument is a NULL handle (i.e. nullified + * by LNetInvalidateHandle()), operations performed on this memory + * descriptor are not logged. + */ + lnet_handle_eq_t eq_handle; +} lnet_md_t; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* Max payload size */ +# define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +# if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +# else +# if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +/* PAGE_SIZE is a constant: check with cpp! */ +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +# endif +# endif + +/** + * Options for the MD structure. See lnet_md_t::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See lnet_md_t::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See lnet_md_t::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See lnet_md_t::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See lnet_md_t::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See lnet_md_t::options. 
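+ *
+ * Illustrative sketch only (not part of the imported header): a
+ * descriptor built over an array of struct iovec sets this bit and
+ * points lnet_md_t::start at the array, with lnet_md_t::length giving
+ * the number of entries rather than a byte count.
+ *
+ *	struct iovec iov[2];
+ *	lnet_md_t    md;
+ *
+ *	md.start     = iov;
+ *	md.length    = 2;		(iovec entries, not bytes)
+ *	md.threshold = 1;		(one operation, then inactive)
+ *	md.max_size  = 0;		(ignored: LNET_MD_MAX_SIZE not set)
+ *	md.options   = LNET_MD_OP_PUT | LNET_MD_IOVEC;
+ *	md.user_ptr  = NULL;
+ *	LNetInvalidateHandle(&md.eq_handle);	(events not logged)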
*/ +#define LNET_MD_IOVEC (1 << 6) +/** See lnet_md_t::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See lnet_md_t::options. */ +#define LNET_MD_KIOV (1 << 8) + +/* For compatibility with Cray Portals */ +#define LNET_MD_PHYS 0 + +/** Infinite threshold on MD operations. See lnet_md_t::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/* NB lustre portals uses struct iovec internally! */ +typedef struct iovec lnet_md_iovec_t; + +/** + * A page-based fragment of a MD. + */ +typedef struct { + /** Pointer to the page where the fragment resides */ + struct page *kiov_page; + /** Length in bytes of the fragment */ + unsigned int kiov_len; + /** + * Starting offset of the fragment within the page. Note that the + * end of the fragment must not pass the end of the page; i.e., + * kiov_len + kiov_offset <= PAGE_CACHE_SIZE. + */ + unsigned int kiov_offset; +} lnet_kiov_t; +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +typedef enum { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +} lnet_event_kind_t; + +#define LNET_SEQ_BASETYPE long +typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t; +#define LNET_SEQ_GT(a, b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +typedef struct { + /** The identifier (nid, pid) of the target. */ + lnet_process_id_t target; + /** The identifier (nid, pid) of the initiator. */ + lnet_process_id_t initiator; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the initiator. + */ + lnet_nid_t sender; + /** Indicates the type of the event. */ + lnet_event_kind_t type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see lnet_md_t). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. 
rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + lnet_handle_md_t md_handle; + /** + * A snapshot of the state of the MD immediately after the event has + * been processed. In particular, the threshold field in md will + * reflect the value of the threshold after the operation occurred. + */ + lnet_md_t md; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see lnet_md_t::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. + */ + volatile lnet_seq_t sequence; +} lnet_event_t; + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. + * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +typedef void (*lnet_eq_handler_t)(lnet_event_t *event); +#define LNET_EQ_HANDLER_NONE NULL +/** @} lnet_eq */ + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). + * + * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which + * acknowledgments can be disabled for a MD. + */ +typedef enum { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +} lnet_ack_req_t; +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/kernel/drivers/staging/lustre/lnet/Kconfig b/kernel/drivers/staging/lustre/lnet/Kconfig new file mode 100644 index 000000000..00850eeb6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/Kconfig @@ -0,0 +1,40 @@ +config LNET + tristate "Lustre networking subsystem" + depends on LUSTRE_FS + +config LNET_MAX_PAYLOAD + int "Lustre lnet max transfer payload (default 2MB)" + depends on LUSTRE_FS + default "1048576" + help + This option defines the maximum size of payload in bytes that lnet + can put into its transport. + + If unsure, use default. + +config LNET_SELFTEST + tristate "Lustre networking self testing" + depends on LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + To compile this as a kernel modules, choose M here and it will be + called lnet_selftest. + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LNET_XPRT_IB + tristate "LNET infiniband support" + depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. 
+ + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/kernel/drivers/staging/lustre/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/Makefile new file mode 100644 index 000000000..f6f03e304 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) += lnet/ klnds/ selftest/ diff --git a/kernel/drivers/staging/lustre/lnet/klnds/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/Makefile new file mode 100644 index 000000000..c23e4f67f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) += o2iblnd/ socklnd/ diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 000000000..e0a7aa72b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 000000000..3bad441de --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" +#include + +static lnd_t the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +kib_data_t kiblnd_data; + +static __u32 kiblnd_cksum(void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +static char *kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(kib_connparams_t); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(kib_putreq_msg_t); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(kib_putack_msg_t); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(kib_get_msg_t); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(kib_completion_msg_t); + default: + return -1; + } +} + +static int kiblnd_unpack_rd(kib_msg_t *msg, int flip) +{ + kib_rdma_desc_t *rd; + int nob; + int n; + int i; + + LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? + &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof(kib_msg_t, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + kib_net_t *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = ni->ni_nid; + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int kiblnd_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? 
__swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + msg->ibm_version = version; + CLASSERT(sizeof(msg->ibm_type) == 1); + CLASSERT(sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_ni = ni; + peer->ibp_nid = nid; + peer->ibp_error = 0; + peer->ibp_last_alive = 0; + atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD(&peer->ibp_conns); + INIT_LIST_HEAD(&peer->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer; + return 0; +} + +void kiblnd_destroy_peer(kib_peer_t *peer) +{ + kib_net_t *net = peer->ibp_ni->ni_data; + + LASSERT(net != NULL); + LASSERT(atomic_read(&peer->ibp_refcount) == 0); + LASSERT(!kiblnd_peer_active(peer)); + LASSERT(peer->ibp_connecting == 0); + LASSERT(peer->ibp_accepting == 0); + 
LASSERT(list_empty(&peer->ibp_conns)); + LASSERT(list_empty(&peer->ibp_tx_queue)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each(tmp, peer_list) { + + peer = list_entry(tmp, kib_peer_t, ibp_list); + + LASSERT(peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", + peer, libcfs_nid2str(nid), + atomic_read(&peer->ibp_refcount), + peer->ibp_version); + return peer; + } + return NULL; +} + +void kiblnd_unlink_peer_locked(kib_peer_t *peer) +{ + LASSERT(list_empty(&peer->ibp_conns)); + + LASSERT(kiblnd_peer_active(peer)); + list_del_init(&peer->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer); +} + +static int kiblnd_get_peer_info(lnet_ni_t *ni, int index, + lnet_nid_t *nidp, int *count) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *count = atomic_read(&peer->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void kiblnd_del_peer_locked(kib_peer_t *peer) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (list_empty(&peer->ibp_conns)) { + kiblnd_unlink_peer_locked(peer); + } else { + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer's last conn unlinked it. */ + } + /* NB peer now unlinked; might even be freed if the peer table had the + * last ref on it. 
*/ +} + +static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid) +{ + LIST_HEAD(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) + continue; + + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT(list_empty(&peer->ibp_conns)); + + list_splice_init(&peer->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &zombies, -EIO); + + return rc; +} + +static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + list_for_each(ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, kib_conn_t, + ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore( + &kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +int kiblnd_translate_mtu(int value) +{ + switch (value) { + default: + return -1; + case 0: + return 0; + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + } +} + +static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + int mtu; + + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); + LASSERT(mtu >= 0); + if (mtu != 0) + cmid->route.path_rec->mtu = mtu; +} + +static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + lnet_nid_t nid = conn->ibc_peer->ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + if (mask == NULL) + return 0; + + /* hash NID to CPU id in this partition... */ + off = do_div(nid, cpumask_weight(mask)); + for_each_cpu(i, mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. 
On failure, the caller's ref on 'peer' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_net_t *net = peer->ibp_ni->ni_data; + kib_dev_t *dev; + struct ib_qp_init_attr *init_qp_attr; + struct kib_sched_info *sched; + kib_conn_t *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer->ibp_nid); + sched = kiblnd_data.kib_scheds[cpt]; + + LASSERT(sched->ibs_nthreads > 0); + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_1; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(version)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(version), + kiblnd_get_completion_vector(conn, cpt)); + if (IS_ERR(cq)) { + CERROR("Can't create CQ: %ld, cqe: %d\n", + PTR_ERR(cq), IBLND_CQ_ENTRIES(version)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notificiation: %d\n", rc); + goto failed_2; + } + + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); + init_qp_attr->cap.max_recv_wr = 
IBLND_RECV_WRS(version); + init_qp_attr->cap.max_send_sge = 1; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + + conn->ibc_sched = sched; + + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (rc != 0) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr); + goto failed_2; + } + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version)); + conn->ibc_nrx = IBLND_RX_MSGS(version); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(version); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], + IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(version)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! */ + LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void kiblnd_destroy_conn(kib_conn_t *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + kib_peer_t *peer = conn->ibc_peer; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT(conn->ibc_noops_posted == 0); + LASSERT(conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT(conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq != NULL) { + rc = ib_destroy_cq(conn->ibc_cq); + if (rc != 0) + CWARN("Error destroying CQ: %d\n", rc); + } + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS(conn->ibc_version) + * sizeof(kib_rx_t)); + } + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + kib_net_t *net = peer->ibp_ni->ni_data; + + 
kiblnd_peer_decref(peer); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int kiblnd_close_stale_conns_locked(kib_peer_t *peer, + int version, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, + "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) +{ + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? 
-ENOENT : 0; +} + +int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch (cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) +{ + unsigned long last_alive = 0; + unsigned long now = cfs_time_current(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_t *peer; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + LASSERT(peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = last_alive; + + /* peer is not persistent in hash, trigger peer creation + * and connection establishment with a NULL tx */ + if (peer == NULL) + kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer, + last_alive ? 
cfs_duration_sec(now - last_alive) : -1); +} + +void kiblnd_free_pages(kib_pages_t *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +{ + kib_pages_t *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_pages_node( + cfs_cpt_spread_node(lnet_cpt_table(), cpt), + GFP_NOFS, 0); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void kiblnd_unmap_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + int i; + + LASSERT(conn->ibc_rxs != NULL); + LASSERT(conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void kiblnd_map_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; + i < IBLND_RX_MSGS(conn->ibc_version); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, + IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + lnet_page2phys(pg) + pg_off); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version)); + } + } +} + +static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +{ + kib_hca_dev_t *hdev = tpo->tpo_hdev; + kib_tx_t *tx; + int i; + + LASSERT(tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev) +{ + kib_hca_dev_t *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + schedule_timeout(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo) +{ 
+ kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; + kib_dev_t *dev; + struct page *page; + kib_tx_t *tx; + int page_offset; + int ipage; + int i; + + LASSERT(net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT(PAGE_SIZE % IBLND_MSG_SIZE == 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single( + tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, + IBLND_MSG_SIZE, DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size) +{ + __u64 index; + + LASSERT(hdev->ibh_mrs[0] != NULL); + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + index = addr >> hdev->ibh_mr_shift; + + if (index < hdev->ibh_nmrs && + index == ((addr + size - 1) >> hdev->ibh_mr_shift)) + return hdev->ibh_mrs[index]; + + return NULL; +} + +struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd) +{ + struct ib_mr *prev_mr; + struct ib_mr *mr; + int i; + + LASSERT(hdev->ibh_mrs[0] != NULL); + + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags) + return NULL; + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + for (i = 0, mr = prev_mr = NULL; + i < rd->rd_nfrags; i++) { + mr = kiblnd_find_dma_mr(hdev, + rd->rd_frags[i].rf_addr, + rd->rd_frags[i].rf_nob); + if (prev_mr == NULL) + prev_mr = mr; + + if (mr == NULL || prev_mr != mr) { + /* Can't covered by one single MR */ + mr = NULL; + break; + } + } + + return mr; +} + +static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool) +{ + LASSERT(pool->fpo_map_count == 0); + + if (pool->fpo_fmr_pool != NULL) + ib_destroy_fmr_pool(pool->fpo_fmr_pool); + + if (pool->fpo_hdev != NULL) + kiblnd_hdev_decref(pool->fpo_hdev); + + LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t)); +} + +static void kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + kib_fmr_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_fmr_pool_t, fpo_list); + list_del(&pool->fpo_list); + kiblnd_destroy_fmr_pool(pool); + } +} + +static int kiblnd_fmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int kiblnd_fmr_flush_trigger(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, + kib_fmr_pool_t **pp_fpo) +{ + /* FMR pool for RDMA */ + kib_dev_t *dev = fps->fps_net->ibn_dev; + kib_fmr_pool_t *fpo; + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = 
fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!*kiblnd_tunables.kib_fmr_cache}; + int rc; + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (fpo == NULL) + return -ENOMEM; + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + + fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param); + if (IS_ERR(fpo->fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fpo_fmr_pool); + CERROR("Failed to create FMR pool: %d\n", rc); + + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); + return rc; + } + + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; +} + +static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, + struct list_head *zombies) +{ + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while (!list_empty(&fps->fps_pool_list)) { + kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, + kib_fmr_pool_t, fpo_list); + fpo->fpo_failed = 1; + list_del(&fpo->fpo_list); + if (fpo->fpo_map_count == 0) + list_add(&fpo->fpo_list, zombies); + else + list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +{ + if (fps->fps_net != NULL) { /* initialized? */ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, + kib_net_t *net, int pool_size, + int flush_trigger) +{ + kib_fmr_pool_t *fpo; + int rc; + + memset(fps, 0, sizeof(kib_fmr_poolset_t)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + fps->fps_pool_size = pool_size; + fps->fps_flush_trigger = flush_trigger; + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return cfs_time_aftereq(now, fpo->fpo_deadline); +} + +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +{ + LIST_HEAD(zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps = fpo->fpo_owner; + unsigned long now = cfs_time_current(); + kib_fmr_pool_t *tmp; + int rc; + + rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); + LASSERT(rc == 0); + + if (status != 0) { + rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool); + LASSERT(rc == 0); + } + + fmr->fmr_pool = NULL; + fmr->fmr_pfmr = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, + __u64 iov, kib_fmr_t *fmr) +{ + struct ib_pool_fmr *pfmr; + kib_fmr_pool_t *fpo; + __u64 version; + int rc; + + again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline =
cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_map_count++; + spin_unlock(&fps->fps_lock); + + pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_pool = fpo; + fmr->fmr_pfmr = pfmr; + return 0; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (PTR_ERR(pfmr) != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return PTR_ERR(pfmr); + } + + /* EAGAIN and ... */ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, + "Another thread is allocating new FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (time_before(cfs_time_current(), fps->fps_next_retry)) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void kiblnd_fini_pool(kib_pool_t *pool) +{ + LASSERT(list_empty(&pool->po_free_list)); + LASSERT(pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(kib_pool_t)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; +} + +static void kiblnd_destroy_pool_list(struct list_head *head) +{ + kib_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_pool_t, po_list); + list_del(&pool->po_list); + + LASSERT(pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +{ + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while (!list_empty(&ps->ps_pool_list)) { + kib_pool_t *po = list_entry(ps->ps_pool_list.next, + kib_pool_t, po_list); + po->po_failed = 1; + list_del(&po->po_list); + if (po->po_allocated == 0) + list_add(&po->po_list, zombies); + else + list_add(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void kiblnd_fini_poolset(kib_poolset_t *ps) +{ + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + kib_pool_t *pool; + int rc; + + memset(ps, 0, sizeof(kib_poolset_t)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return cfs_time_aftereq(now, pool->po_deadline); +} + +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +{ + LIST_HEAD(zombies); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + unsigned long now = cfs_time_current(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps) +{ + struct list_head *node; + kib_pool_t *pool; + int rc; + + again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... 
*/ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting for her to complete\n", + ps->ps_name); + schedule(); + goto again; + } + + if (time_before(cfs_time_current(), ps->ps_next_retry)) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr) +{ + kib_pmr_pool_t *ppo = pmr->pmr_pool; + struct ib_mr *mr = pmr->pmr_mr; + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list); + if (mr != NULL) + ib_dereg_mr(mr); +} + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr) +{ + kib_phys_mr_t *pmr; + struct list_head *node; + int rc; + int i; + + node = kiblnd_pool_alloc_node(&pps->pps_poolset); + if (node == NULL) { + CERROR("Failed to allocate PMR descriptor\n"); + return -ENOMEM; + } + + pmr = container_of(node, kib_phys_mr_t, pmr_list); + if (pmr->pmr_pool->ppo_hdev != hdev) { + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + return -EAGAIN; + } + + for (i = 0; i < rd->rd_nfrags; i++) { + pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr; + pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob; + } + + pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd, + pmr->pmr_ipb, rd->rd_nfrags, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE, + iova); + if (!IS_ERR(pmr->pmr_mr)) { + pmr->pmr_iova = *iova; + *pp_pmr = pmr; + return 0; + } + + rc = PTR_ERR(pmr->pmr_mr); + CERROR("Failed ib_reg_phys_mr: %d\n", rc); + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + + return rc; +} + +static void kiblnd_destroy_pmr_pool(kib_pool_t *pool) +{ + kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool); + kib_phys_mr_t *pmr; + kib_phys_mr_t *tmp; + + LASSERT(pool->po_allocated == 0); + + list_for_each_entry_safe(pmr, tmp, &pool->po_free_list, pmr_list) { + LASSERT(pmr->pmr_mr == NULL); + list_del(&pmr->pmr_list); + + if (pmr->pmr_ipb != NULL) { + LIBCFS_FREE(pmr->pmr_ipb, + IBLND_MAX_RDMA_FRAGS * + sizeof(struct ib_phys_buf)); + } + + LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t)); + } + + kiblnd_fini_pool(pool); + if (ppo->ppo_hdev != NULL) + kiblnd_hdev_decref(ppo->ppo_hdev); + + LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t)); +} + +static inline int kiblnd_pmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts; + + return max(IBLND_PMR_POOL, size); +} + +static int kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, + kib_pool_t **pp_po) +{ + struct kib_pmr_pool *ppo; + struct kib_pool *pool; + kib_phys_mr_t *pmr; + int i; + + LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_pmr_pool_t)); + if (ppo == NULL) { + CERROR("Failed to allocate PMR pool\n"); + return -ENOMEM; + } + + pool = &ppo->ppo_pool; + kiblnd_init_pool(ps, pool, size); + + for (i = 0; i < size; i++) { + LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_phys_mr_t)); + if (pmr 
== NULL) + break; + + pmr->pmr_pool = ppo; + LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb)); + if (pmr->pmr_ipb == NULL) + break; + + list_add(&pmr->pmr_list, &pool->po_free_list); + } + + if (i < size) { + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev); + *pp_po = pool; + return 0; +} + +static void kiblnd_destroy_tx_pool(kib_pool_t *pool) +{ + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; + + LASSERT(pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(kib_tx_t)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); +} + +static int kiblnd_tx_pool_size(int ncpts) +{ + int ntx = *kiblnd_tunables.kib_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size, + kib_pool_t **pp_po) +{ + int i; + int npg; + kib_pool_t *pool; + kib_tx_pool_t *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(kib_tx_t)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + + for (i = 0; i < size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(kib_rdma_desc_t, + 
rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +{ + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, + tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void kiblnd_net_fini_pools(kib_net_t *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; + kib_pmr_poolset_t *pps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + + if (net->ibn_pmr_ps != NULL) { + pps = net->ibn_pmr_ps[i]; + kiblnd_fini_poolset(&pps->pps_poolset); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } + + if (net->ibn_pmr_ps != NULL) { + cfs_percpt_free(net->ibn_pmr_ps); + net->ibn_pmr_ps = NULL; + } +} + +static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) +{ + unsigned long flags; + int cpt; + int rc; + int i; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (*kiblnd_tunables.kib_map_on_demand == 0 && + net->ibn_dev->ibd_hdev->ibh_nmrs == 1) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR/PMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR/PMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_fmr_poolset_t)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, + kiblnd_fmr_pool_size(ncpts), + kiblnd_fmr_flush_trigger(ncpts)); + if (rc == -ENOSYS && i == 0) /* no FMR */ + break; /* create PMR pool */ + + if (rc != 0) { /* a real error */ + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) { + LASSERT(i == ncpts); + goto create_tx_pool; + } + + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + + CWARN("Device does not support FMR, failing back to PMR\n"); + + if (*kiblnd_tunables.kib_pmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_pmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_pmr_poolset_t)); + if (net->ibn_pmr_ps == NULL) { + CERROR("Failed to allocate PMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset, + cpt, net, "PMR", + kiblnd_pmr_pool_size(ncpts), + kiblnd_create_pmr_pool, + kiblnd_destroy_pmr_pool, NULL, NULL); + if (rc != 0) { + CERROR("Can't initialize PMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + create_tx_pool: + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_tx_poolset_t)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +{ + struct ib_device_attr *attr; + int rc; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + + LIBCFS_ALLOC(attr, sizeof(*attr)); + if (attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, attr); + if (rc == 0) + hdev->ibh_mr_size = attr->max_mr_size; + + LIBCFS_FREE(attr, sizeof(*attr)); + + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + return rc; + } + + if (hdev->ibh_mr_size == ~0ULL) { + hdev->ibh_mr_shift = 64; + return 0; + } + + for (hdev->ibh_mr_shift = 0; + hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) { + if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) || + hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1) + return 0; + } + + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return -EINVAL; +} + +static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +{ + int i; + + if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL) + return; + + for (i = 0; i < hdev->ibh_nmrs; i++) { + if (hdev->ibh_mrs[i] == NULL) + break; + + ib_dereg_mr(hdev->ibh_mrs[i]); + } + + LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + hdev->ibh_mrs = NULL; + hdev->ibh_nmrs = 0; +} + +void kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +{ + kiblnd_hdev_cleanup_mrs(hdev); + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +{ + struct ib_mr *mr; + int i; + int rc; + __u64 mm_size; + __u64 mr_size; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) + return rc; + + if (hdev->ibh_mr_shift == 64) { + LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs)); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs table\n"); + return -ENOMEM; + } + + hdev->ibh_mrs[0] = NULL; + hdev->ibh_nmrs = 1; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs[0] = mr; + + goto out; + } + + mr_size = 1ULL << hdev->ibh_mr_shift; + mm_size = (unsigned long)high_memory - PAGE_OFFSET; + + hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> 
hdev->ibh_mr_shift); + + if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) { + /* it's 4T..., assume we will re-code at that time */ + CERROR("Can't support memory size: x%#llx with MR size: x%#llx\n", + mm_size, mr_size); + return -EINVAL; + } + + /* create an array of MRs to cover all memory */ + LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs' table\n"); + return -ENOMEM; + } + + for (i = 0; i < hdev->ibh_nmrs; i++) { + struct ib_phys_buf ipb; + __u64 iova; + + ipb.size = hdev->ibh_mr_size; + ipb.addr = i * mr_size; + iova = ipb.addr; + + mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova); + if (IS_ERR(mr)) { + CERROR("Failed ib_reg_phys_mr addr %#llx size %#llx : %ld\n", + ipb.addr, ipb.size, PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + LASSERT(iova == ipb.addr); + + hdev->ibh_mrs[i] = mr; + } + +out: + if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1) + LCONSOLE_INFO("Register global MR array, MR size: %#llx, array size: %d\n", + hdev->ibh_mr_size, hdev->ibh_nmrs); + return 0; +} + +/* DUMMY */ +static int kiblnd_dummy_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event) +{ + return 0; +} + +static int kiblnd_dev_need_failover(kib_dev_t *dev) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + if (dev->ibd_hdev->ibh_ibdev == cmid->device) { + /* don't need device failover */ + rdma_destroy_id(cmid); + return 0; + } + + return 1; +} + +int kiblnd_dev_failover(kib_dev_t *dev) +{ + LIST_HEAD(zombie_tpo); + LIST_HEAD(zombie_ppo); + LIST_HEAD(zombie_fpo); + struct rdma_cm_id *cmid = NULL; + kib_hca_dev_t *hdev = NULL; + kib_hca_dev_t *old; + struct ib_pd *pd; + kib_net_t *net; + struct sockaddr_in addr; + unsigned long flags; + int rc = 0; + int i; + + LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. 
+ * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + + pd = ib_alloc_pd(cmid->device); + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) { + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + + } else if (net->ibn_pmr_ps != NULL) { + kiblnd_fail_poolset(&net->ibn_pmr_ps[i]-> + pps_poolset, &zombie_ppo); + } + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) + dev->ibd_failed_failover++; + else + dev->ibd_failed_failover = 0; + + return rc; +} + +void kiblnd_destroy_dev(kib_dev_t *dev) +{ + LASSERT(dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static kib_dev_t *kiblnd_create_dev(char *ifname) +{ + struct net_device *netdev; + kib_dev_t *dev; + __u32 netmask; + __u32 ip; + int up; + int rc; + + rc = libcfs_ipif_query(ifname, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", + ifname, rc); + return NULL; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ifname); + return NULL; + } + + LIBCFS_ALLOC(dev, sizeof(*dev)); + if (dev == NULL) + return NULL; + + netdev = dev_get_by_name(&init_net, ifname); + if 
(netdev == NULL) { + dev->ibd_can_failover = 0; + } else { + dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); + dev_put(netdev); + } + + INIT_LIST_HEAD(&dev->ibd_nets); + INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&dev->ibd_fail_list); + dev->ibd_ifip = ip; + strcpy(&dev->ibd_ifname[0], ifname); + + /* initialize the device */ + rc = kiblnd_dev_failover(dev); + if (rc != 0) { + CERROR("Can't initialize device: %d\n", rc); + LIBCFS_FREE(dev, sizeof(*dev)); + return NULL; + } + + list_add_tail(&dev->ibd_list, + &kiblnd_data.kib_devs); + return dev; +} + +static void kiblnd_base_shutdown(void) +{ + struct kib_sched_info *sched; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT(kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + LASSERT(list_empty(&kiblnd_data.kib_peers[i])); + LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up_all(&kiblnd_data.kib_connd_waitq); + wake_up_all(&kiblnd_data.kib_failover_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + /* power of 2 ? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + /* fall through */ + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +void kiblnd_shutdown(lnet_ni_t *ni) +{ + kib_net_t *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
*/ + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + /* fall through */ + + case IBLND_INIT_NOTHING: + LASSERT(atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); +} + +static int kiblnd_base_startup(void) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + try_module_get(THIS_MODULE); + /* zero pointers, flags etc */ + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) + goto failed; + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else 
{ + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, + int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static kib_dev_t *kiblnd_dev_search(char *ifname) +{ + kib_dev_t *alias = NULL; + kib_dev_t *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +int kiblnd_startup(lnet_ni_t *ni) +{ + char *ifname; + kib_dev_t *ibdev = NULL; + kib_net_t *net; + struct timeval tv; + unsigned long flags; + int rc; + int newdev; + + LASSERT(ni->ni_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto net_failed; + + do_gettimeofday(&tv); + net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout; + ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; + ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits; + ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits; + + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + + CLASSERT(LNET_MAX_INTERFACES > 1); + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) + ibdev = kiblnd_create_dev(ifname); + + if (ibdev == NULL) + goto failed; + + net->ibn_dev = ibdev; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); + + rc = 
kiblnd_dev_start_threads(ibdev, newdev, + ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + +net_failed: + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; +} + +static void __exit kiblnd_module_fini(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init kiblnd_module_init(void) +{ + int rc; + + CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00"); +MODULE_LICENSE("GPL"); + +module_init(kiblnd_module_init); +module_exit(kiblnd_module_fini); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 000000000..cd664d025 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1030 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnet.h" +#include "../../../include/linux/lnet/lib-lnet.h" +#include "../../../include/linux/lnet/lnet-sysctl.h" + +#include +#include +#include +#include + +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 + +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +typedef struct { + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peertxcredits; /* # concurrent sends to 1 peer */ + int *kib_peerrtrcredits; /* # per-peer router buffer credits */ + int *kib_peercredits_hiw; /* # when eagerly to return credits */ + int *kib_peertimeout; /* seconds to consider peer dead */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_concurrent_sends; /* send work queue sizing */ + int *kib_ib_mtu; /* IB MTU */ + int *kib_map_on_demand; /* map-on-demand if RD has more fragments + * than this value, 0 disable map-on-demand */ + int *kib_pmr_pool_size; /* # physical MR in pool */ + int *kib_fmr_pool_size; /* # FMRs in pool */ + int *kib_fmr_flush_trigger; /* When to trigger FMR flush */ + int *kib_fmr_cache; /* enable FMR pool cache? */ + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; +} kib_tunables_t; + +extern kib_tunables_t kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ +#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer credits */ + +#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MSG_QUEUE_SIZE_V1 : \ + *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */ +#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */ + +#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt) + +static inline int +kiblnd_concurrent_sends_v1(void) +{ + if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + + return *kiblnd_tunables.kib_concurrent_sends; +} + +#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? 
\ + kiblnd_concurrent_sends_v1() : \ + *kiblnd_tunables.kib_concurrent_sends) +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ +#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand != 0 ? \ + *kiblnd_tunables.kib_map_on_demand : \ + IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */ +#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS) + +/************************/ +/* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_PMR_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* TX messages (shared by all connections) */ +#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v)) +#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v) +#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v)) +#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +typedef struct { + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + unsigned long ibd_next_failover; + int ibd_failed_failover; /* # failover failures */ + unsigned int ibd_failover; /* failover in progress */ + unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */ + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; +} kib_dev_t; + +typedef struct kib_hca_dev { + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + int ibh_mr_shift; /* bits shift of max MR size */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_nmrs; /* # of global MRs */ + struct ib_mr **ibh_mrs; /* global MR */ + struct ib_pd *ibh_pd; /* PD */ + kib_dev_t *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +} kib_hca_dev_t; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +typedef struct { + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +} kib_pages_t; + +struct kib_pmr_pool; + +typedef struct { + struct list_head pmr_list; /* chain node */ + struct ib_phys_buf *pmr_ipb; /* physical buffer */ + struct ib_mr *pmr_mr; /* IB MR */ + struct kib_pmr_pool *pmr_pool; /* owner of this MR */ + __u64 pmr_iova; /* Virtual I/O address */ + int pmr_refcount; /* reference count */ +} kib_phys_mr_t; + 
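The per-connection work-request and CQ sizing macros above are plain arithmetic over the tunables, and it is easy to underestimate how quickly the send side dominates once map-on-demand is disabled. A minimal userspace sketch of that arithmetic follows (an editorial illustration, not part of this patch; the helper names and the default values used, 8 peer credits, 8 concurrent sends and 256 RDMA fragments, are assumptions rather than values taken from the code):

#include <stdio.h>

/* mirrors IBLND_RX_MSGS(): twice the message queue depth, plus 2 OOB slots
 * on version-2 connections */
static int rx_msgs(int queue_size, int oob_capable)
{
	return queue_size * 2 + (oob_capable ? 2 : 0);
}

/* mirrors IBLND_SEND_WRS(): one work request per RDMA fragment plus one for
 * the message itself, for every concurrent send */
static int send_wrs(int rdma_frags, int concurrent_sends)
{
	return (rdma_frags + 1) * concurrent_sends;
}

int main(void)
{
	int queue_size = 8;	/* assumed peer credits */
	int csends = 8;		/* assumed concurrent sends */
	int rdma_frags = 256;	/* assumed LNET_MAX_IOV, map-on-demand off */
	int recv = rx_msgs(queue_size, 1);
	int send = send_wrs(rdma_frags, csends);

	/* mirrors IBLND_CQ_ENTRIES(): the CQ must absorb every posted WR */
	printf("recv WRs %d, send WRs %d, CQ entries %d\n",
	       recv, send, recv + send);
	return 0;
}

With those assumed defaults the receive side needs only 18 WRs while the send side needs 2056, so the CQ depth is set almost entirely by (RDMA fragments + 1) * concurrent sends.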
+struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +typedef struct kib_poolset { + spinlock_t ps_lock; /* serialize */ + struct kib_net *ps_net; /* network it belongs to */ + char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ + struct list_head ps_pool_list; /* list of pools */ + struct list_head ps_failed_pool_list; /* failed pool list */ + unsigned long ps_next_retry; /* time stamp for retry if failed to allocate */ + int ps_increasing; /* is allocating new pool */ + int ps_pool_size; /* new pool size */ + int ps_cpt; /* CPT id */ + + kib_ps_pool_create_t ps_pool_create; /* create a new pool */ + kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ + kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ + kib_ps_node_fini_t ps_node_fini; /* finalize node */ +} kib_poolset_t; + +typedef struct kib_pool { + struct list_head po_list; /* chain on pool list */ + struct list_head po_free_list; /* pre-allocated node */ + kib_poolset_t *po_owner; /* pool_set of this pool */ + unsigned long po_deadline; /* deadline of this pool */ + int po_allocated; /* # of elements in use */ + int po_failed; /* pool is created on failed HCA */ + int po_size; /* # of pre-allocated elements */ +} kib_pool_t; + +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +} kib_tx_poolset_t; + +typedef struct { + kib_pool_t tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; + +typedef struct { + kib_poolset_t pps_poolset; /* pool-set */ +} kib_pmr_poolset_t; + +typedef struct kib_pmr_pool { + struct kib_hca_dev *ppo_hdev; /* device for this pool */ + kib_pool_t ppo_pool; /* pool */ +} kib_pmr_pool_t; + +typedef struct { + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + unsigned long fps_next_retry; +} kib_fmr_poolset_t; + +typedef struct { + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + unsigned long fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +} kib_fmr_pool_t; + +typedef struct { + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +} kib_fmr_t; + +typedef struct kib_net { + struct list_head ibn_list; /* chain on kib_dev_t::ibd_nets */ + __u64 ibn_incarnation; /* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? 
*/ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */ + + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +typedef struct { + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + struct list_head *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +} kib_data_t; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +typedef struct kib_connparams { + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR kib_connparams_t; + +typedef struct { + lnet_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; + +typedef struct { + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
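+ * (in the WIRE_ATTR packed layout rf_addr starts at offset 4,
+ * immediately after the 32-bit rf_nob, hence the misalignment)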
*/ +} WIRE_ATTR kib_rdma_frag_t; + +typedef struct { + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; + +typedef struct { + lnet_hdr_t ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; + +typedef struct { + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; + +typedef struct { + lnet_hdr_t ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; + +typedef struct { + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR kib_completion_msg_t; + +typedef struct { + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an ibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR kib_msg_t; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +typedef struct { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ + __u8 ibr_padding; /* padding */ + __u64 ibr_incarnation; /* incarnation of peer */ + kib_connparams_t ibr_cp; /* connection parameters */ +} WIRE_ATTR kib_rej_t; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer */ + +#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_nob; /* # bytes received (-1 
while posted) */ + enum ib_wc_status rx_status; /* completion status */ + kib_msg_t *rx_msg; /* message buffer (host vaddr) */ + __u64 rx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */ + struct ib_recv_wr rx_wrq; /* receive work item... */ + struct ib_sge rx_sge; /* ...and its memory */ +} kib_rx_t; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + kib_tx_pool_t *tx_pool; /* pool I'm from */ + struct kib_conn *tx_conn; /* owning conn */ + short tx_sending; /* # tx callbacks outstanding */ + short tx_queued; /* queued for sending */ + short tx_waiting; /* waiting for peer */ + int tx_status; /* LNET completion status */ + unsigned long tx_deadline; /* completion deadline */ + __u64 tx_cookie; /* completion cookie */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ + kib_msg_t *tx_msg; /* message buffer (host vaddr) */ + __u64 tx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */ + int tx_nwrq; /* # send work items */ + struct ib_send_wr *tx_wrq; /* send work items... */ + struct ib_sge *tx_sge; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor */ + int tx_nfrags; /* # entries in... */ + struct scatterlist *tx_frags; /* dma_map_sg descriptor */ + __u64 *tx_pages; /* rdma phys page addrs */ + union { + kib_phys_mr_t *pmr; /* MR for physical buffer */ + kib_fmr_t fmr; /* FMR */ + } tx_u; + int tx_dmadir; /* dma direction */ +} kib_tx_t; + +typedef struct kib_connvars { + /* connection-in-progress variables */ + kib_msg_t cv_msg; +} kib_connvars_t; + +typedef struct kib_conn { + struct kib_sched_info *ibc_sched; /* scheduler information */ + struct kib_peer *ibc_peer; /* owning peer */ + kib_hca_dev_t *ibc_hdev; /* HCA bound on */ + struct list_head ibc_list; /* stash on peer's conn list */ + struct list_head ibc_sched_list; /* schedule for attention */ + __u16 ibc_version; /* version of connection */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_noops_posted; /* # uncompleted NOOPs */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits;/* # ACK/DONE msg credits */ + int ibc_comms_error; /* set on comms error */ + unsigned int ibc_nrx:16; /* receive buffers owned */ + unsigned int ibc_scheduled:1; /* scheduled for attention */ + unsigned int ibc_ready:1; /* CQ callback fired */ + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + struct list_head ibc_tx_queue; /* sends that need a credit */ + struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t 
ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + + struct rdma_cm_id *ibc_cmid; /* CM id */ + struct ib_cq *ibc_cq; /* completion queue */ + + kib_connvars_t *ibc_connvars; /* in-progress connection state */ +} kib_conn_t; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +typedef struct kib_peer { + struct list_head ibp_list; /* stash on global peer list */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_ni_t *ibp_ni; /* LNet interface */ + atomic_t ibp_refcount; /* # users */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + __u16 ibp_version; /* version of peer */ + __u64 ibp_incarnation; /* incarnation of peer */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ + int ibp_error; /* errno on closing this peer */ + unsigned long ibp_last_alive; /* when (in jiffies) I was last alive */ +} kib_peer_t; + +extern kib_data_t kiblnd_data; + +extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); + +static inline void +kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(kib_dev_t *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +#define kiblnd_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kiblnd_destroy_peer(peer); \ +} while (0) + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = + ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return 
(&kiblnd_data.kib_peers [hash]); +} + +static inline int +kiblnd_peer_active (kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline kib_conn_t * +kiblnd_get_conn_locked (kib_peer_t *peer) +{ + LASSERT (!list_empty(&peer->ibp_conns)); + + /* just return the first connection */ + return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list); +} + +static inline int +kiblnd_send_keepalive(kib_conn_t *conn) +{ + return (*kiblnd_tunables.kib_keepalive > 0) && + cfs_time_after(jiffies, conn->ibc_last_send + + *kiblnd_tunables.kib_keepalive*HZ); +} + +static inline int +kiblnd_need_noop(kib_conn_t *conn) +{ + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(conn->ibc_version) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(kib_conn_t *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str (kib_conn_t *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
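+ * IBLND_WID_MASK is 3, so any descriptor that is at least 4-byte aligned
+ * leaves its two low bits clear: kiblnd_ptr2wreqid() ORs the work item type
+ * into those bits, and kiblnd_wreqid2ptr()/kiblnd_wreqid2type() recover the
+ * pointer and the type again by masking.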
*/ + +#define IBLND_WID_TX 0 +#define IBLND_WID_RDMA 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_MASK 3UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state (kib_conn_t *conn, int state) +{ + conn->ibc_state = state; + mb(); +} + +static inline void +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline __u32 +kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+ offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : + offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); +} + + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return ib_dma_map_sg(dev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + ib_dma_unmap_sg(dev, sg, nents, direction); +} + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + + +struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd); +struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, + __u64 addr, __u64 size); +void kiblnd_map_rx_descs(kib_conn_t *conn); +void kiblnd_unmap_rx_descs(kib_conn_t *conn); +int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags); +void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, + int npages, __u64 iov, kib_fmr_t *fmr); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr); +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr); + +int kiblnd_startup (lnet_ni_t *ni); +void kiblnd_shutdown (lnet_ni_t *ni); +int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); + +int kiblnd_tunables_init(void); +void kiblnd_tunables_fini(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); +void kiblnd_free_pages (kib_pages_t *p); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(kib_dev_t *dev); +int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); +void kiblnd_destroy_peer (kib_peer_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_t *peer); +void 
kiblnd_peer_alive (kib_peer_t *peer); +kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid); +void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error); +int kiblnd_close_stale_conns_locked (kib_peer_t *peer, + int version, __u64 incarnation); +int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why); + +void kiblnd_connreq_done(kib_conn_t *conn, int status); +kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn (kib_conn_t *conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked (kib_conn_t *conn, int error); + +int kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type, + int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); + +void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid); +void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob); +void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, + int status); +void kiblnd_check_sends (kib_conn_t *conn); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); + +int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 000000000..dbf374983 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,3519 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +static void +kiblnd_tx_done(lnet_ni_t *ni, kib_tx_t *tx) +{ + lnet_msg_t *lntmsg[2]; + kib_net_t *net = ni->ni_data; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + LASSERT(!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT(tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */ + LASSERT(tx->tx_pool != NULL); + + kiblnd_unmap_tx(ni, tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni); + + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize(ni, lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kiblnd_tx_done(ni, tx); + } +} + +static kib_tx_t * +kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target) +{ + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, kib_tx_t, tx_list); + + LASSERT(tx->tx_nwrq == 0); + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending == 0); + LASSERT(!tx->tx_waiting); + LASSERT(tx->tx_status == 0); + LASSERT(tx->tx_conn == NULL); + LASSERT(tx->tx_lntmsg[0] == NULL); + LASSERT(tx->tx_lntmsg[1] == NULL); + LASSERT(tx->tx_u.pmr == NULL); + LASSERT(tx->tx_nfrags == 0); + + return tx; +} + +static void +kiblnd_drop_rx(kib_rx_t *rx) +{ + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx(kib_rx_t *rx, int credit) +{ + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; + struct ib_mr *mr; + int rc; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + LASSERT(credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); + + mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE); + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT(conn->ibc_state >= IBLND_CONN_INIT); + LASSERT(rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + rc = 
ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); + if (rc != 0) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + return rc; + + if (rc != 0) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return rc; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + return 0; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + return 0; +} + +static kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(ni, tx); +} + +static void +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +{ + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx(kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG(D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? 
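+ * Credits returned by the peer mean it has re-posted that many receive
+ * buffers, so the running total must never exceed the negotiated
+ * IBLND_MSG_QUEUE_SIZE() for this connection version; a larger value is
+ * a protocol error and the check below closes the connection with -EPROTO.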
*/ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + spin_unlock(&conn->ibc_lock); + kiblnd_check_sends(conn); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN("PUT_NACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT(tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. 
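+ * So the descriptor is simply reused in place: its work requests are
+ * reset and rebuilt below as the PUT_DONE RDMA.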
*/ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete(kib_rx_t *rx, int status, int nob) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT(net != NULL); + LASSERT(rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT(nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! */ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. 
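+ * Either the receive failed (the conn was closed just above) or the
+ * conn is already shutting down; dropping the rx releases its ref on
+ * the conn instead of reposting it.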
*/ +} + +static struct page * +kiblnd_kvaddr_to_page(unsigned long vaddr) +{ + struct page *page; + + if (is_vmalloc_addr((void *)vaddr)) { + page = vmalloc_to_page((void *)vaddr); + LASSERT(page != NULL); + return page; + } +#ifdef CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } +#endif + page = virt_to_page(vaddr); + LASSERT(page != NULL); + return page; +} + +static int +kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + __u64 *pages = tx->tx_pages; + kib_fmr_poolset_t *fps; + int npages; + int size; + int cpt; + int rc; + int i; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr); + if (rc != 0) { + CERROR("Can't map %d pages: %d\n", npages, rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey : + tx->tx_u.fmr.fmr_pfmr->fmr->lkey; + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + + return 0; +} + +static int +kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + kib_pmr_poolset_t *pps; + __u64 iova; + int cpt; + int rc; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask; + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + pps = net->ibn_pmr_ps[cpt]; + rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr); + if (rc != 0) { + CERROR("Failed to create MR by phybuf: %d\n", rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey : + tx->tx_u.pmr->pmr_mr->lkey; + rd->rd_nfrags = 1; + rd->rd_frags[0].rf_addr = iova; + rd->rd_frags[0].rf_nob = nob; + + return 0; +} + +void +kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx) +{ + kib_net_t *net = ni->ni_data; + + LASSERT(net != NULL); + + if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) { + kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); + tx->tx_u.fmr.fmr_pfmr = NULL; + + } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) { + kiblnd_pmr_pool_unmap(tx->tx_u.pmr); + tx->tx_u.pmr = NULL; + } + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +int +kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + kib_net_t *net = ni->ni_data; + struct ib_mr *mr = NULL; + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = + kiblnd_dma_map_sg(hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + + /* looking for pre-mapping MR */ + mr = kiblnd_find_rd_dma_mr(hdev, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + else if (net->ibn_pmr_ps != NULL) + return kiblnd_pmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + + +static int +kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct kvec *iov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct page *page; + struct scatterlist *sg; + unsigned long vaddr; + int fragnob; + int page_offset; + + LASSERT(nob > 0); + LASSERT(niov > 0); + LASSERT(net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT(niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kiblnd_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + sg_set_page(sg, page, fragnob, page_offset); + sg++; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + sg_set_page(sg, kiov->kiov_page, fragnob, + kiov->kiov_offset + offset); + sg++; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit) + __releases(conn->ibc_lock) + __acquires(conn->ibc_lock) +{ + kib_msg_t *msg = tx->tx_msg; + kib_peer_t *peer = conn->ibc_peer; + int ver = conn->ibc_version; + int rc; + int done; + struct ib_send_wr *bad_wrq; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0); + LASSERT(tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver)); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + + if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) { + /* tx completions outstanding... 
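+ * ibc_nsends_posted has reached IBLND_CONCURRENT_SENDS(ver); returning
+ * -EAGAIN makes kiblnd_check_sends() stop posting until a completion
+ * frees a slot.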
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + kiblnd_tx_done(peer->ibp_ni, tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! 
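+ * The work request is posted (or refused) without dropping the lock;
+ * on failure the credit and counter updates made above are undone
+ * below before the connection is closed.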
*/ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + rc = ib_post_send(conn->ibc_cmid->qp, + tx->tx_wrq, &bad_wrq); + } + + conn->ibc_last_send = jiffies; + + if (rc == 0) + return 0; + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(peer->ibp_ni, tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +void +kiblnd_check_sends(kib_conn_t *conn) +{ + int ver = conn->ibc_version; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + spin_lock(&conn->ibc_lock); + + LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver)); + LASSERT(!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT(conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + kiblnd_conn_addref(conn); /* 1 ref for me.... 
(see b21911) */ + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT(!IBLND_OOB_CAPABLE(ver)); + credit = 1; + tx = list_entry(conn->ibc_tx_noops.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_entry(conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +static void +kiblnd_tx_complete(kib_tx_t *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; + + LASSERT(tx->tx_sending > 0); + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_waiting = 0; /* don't wait for peer */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_conn_addref(conn); /* 1 ref for me.... 
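+ * The extra ref keeps the conn alive across the unlock while the tx is
+ * finalised and kiblnd_check_sends() runs; it is dropped just below.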
*/ + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); + + kiblnd_check_sends(conn); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +void +kiblnd_init_tx_msg(lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; + struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof(kib_msg_t, ibm_u) + body_nob; + struct ib_mr *mr; + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + + mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob); + LASSERT(mr != NULL); + + sge->lkey = mr->lkey; + sge->addr = tx->tx_msgaddr; + sge->length = nob; + + memset(wrq, 0, sizeof(*wrq)); + + wrq->next = NULL; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_SEND; + wrq->send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +int +kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +{ + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + struct ib_sge *sge = &tx->tx_sge[0]; + struct ib_send_wr *wrq = &tx->tx_wrq[0]; + int rc = resid; + int srcidx; + int dstidx; + int wrknob; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0); + LASSERT(type == IBLND_MSG_GET_DONE || + type == IBLND_MSG_PUT_DONE); + + srcidx = dstidx = 0; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) { + CERROR("RDMA too fragmented for %s (%d): %d/%d src %d/%d dst frags\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + IBLND_RDMA_FRAGS(conn->ibc_version), + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + wrknob = min(min(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx)), + (__u32) resid); + + sge = &tx->tx_sge[tx->tx_nwrq]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = wrknob; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->next = wrq + 1; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_RDMA_WRITE; + wrq->send_flags = 0; + + wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx); + wrq->wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, dstidx); + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob); + + resid -= wrknob; + + tx->tx_nwrq++; + wrq++; + sge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof(kib_completion_msg_t)); + + return rc; +} + +void +kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) +{ + struct list_head *q; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ); + + if (tx->tx_conn == NULL) 
{ + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +void +kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); +} + +static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("Failed to bind to a free privileged port\n"); + return rc; +} + +static void +kiblnd_connect_peer(kib_peer_t *peer) +{ + struct rdma_cm_id *cmid; + kib_dev_t *dev; + kib_net_t *net = peer->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT(net != NULL); + LASSERT(peer->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP, + IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); + + kiblnd_peer_addref(peer); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed2; + } + + LASSERT(cmid->device != NULL); + CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n", + libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return; + + failed2: + kiblnd_peer_decref(peer); /* cmid's ref */ + rdma_destroy_id(cmid); + failed: + 
kiblnd_peer_connect_failed(peer, 1, rc); +} + +void +kiblnd_launch_tx(lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT(tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT(tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer + * connected */ + read_lock_irqsave(g_lock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL && !list_empty(&peer->ibp_conns)) { + /* Found a peer with an established connection */ + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + if (list_empty(&peer->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT(peer->ibp_connecting != 0 || + peer->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer ready to add to the peer table and retry */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT(peer2->ibp_connecting != 0 || + peer2->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer); + return; + } + + /* Brand new peer */ + LASSERT(peer->ibp_connecting == 0); + peer->ibp_connecting = 1; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_connect_peer(peer); + kiblnd_peer_decref(peer); +} + +int +kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT(payload_nob == 0 || payload_niov > 0); + LASSERT(payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT(!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT(!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return -EIO; + + case LNET_MSG_ACK: + LASSERT(payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kiblnd_setup_rd_iov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? 
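+ * If the header plus payload fits in a single IBLND_MSG_SIZE message,
+ * fall through and copy the payload inline as an IMMEDIATE instead of
+ * setting up a PUT.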
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + + LASSERT(offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +static void +kiblnd_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct kvec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
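+ * A zero-length GET reply moves no data, so lntmsg is finalised here
+ * rather than being attached to the tx for the RDMA completion.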
*/ + lnet_finalize(ni, lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kiblnd_tx_done(ni, tx); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); +} + +int +kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + kib_msg_t *txmsg; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT(mlen <= rlen); + LASSERT(!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT(!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(ni, lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_tx_done(ni, tx); + /* tell peer it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, "%s", name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + atomic_inc(&kiblnd_data.kib_nthreads); + return 0; +} + +static void 
+kiblnd_thread_fini(void) +{ + atomic_dec(&kiblnd_data.kib_nthreads); +} + +void +kiblnd_peer_alive(kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +static void +kiblnd_peer_notify(kib_peer_t *peer) +{ + int error = 0; + unsigned long last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer->ibp_ni, + peer->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked(kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + kib_dev_t *dev; + unsigned long flags; + + LASSERT(error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer)) { /* still in peer table */ + kiblnd_unlink_peer_locked(peer); + + /* set/clear error on last conn */ + peer->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + kib_rx_t *tmp; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + list_for_each_entry_safe(rx, tmp, &conn->ibc_early_rxs, rx_list) { + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD(zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe(tmp, nxt, txs) { + tx = list_entry(tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + } else { + LASSERT(tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_del(&tx->tx_list); + list_add(&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED); +} + +static void +kiblnd_finalise_conn(kib_conn_t *conn) +{ + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state > IBLND_CONN_INIT); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + /* Complete all tx descs not waiting for sends to complete. 
+ * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error) +{ + LIST_HEAD(zombies); + unsigned long flags; + + LASSERT(error != 0); + LASSERT(!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer->ibp_connecting > 0); + peer->ibp_connecting--; + } else { + LASSERT(peer->ibp_accepting > 0); + peer->ibp_accepting--; + } + + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { + /* another connection attempt under way... */ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kiblnd_peer_active(peer)) + kiblnd_unlink_peer_locked(peer); + + peer->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); +} + +void +kiblnd_connreq_done(kib_conn_t *conn, int status) +{ + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + kib_tx_t *tmp; + struct list_head txs; + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer->ibp_nid), active, + conn->ibc_version, status); + + LASSERT(!in_interrupt()); + LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = jiffies; + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer); + + /* Add conn to peer's list and nuke any dangling conns from a different + * peer instance... 
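+ * A peer that has rebooted arrives with a new incarnation; any conns
+ * still carrying the old one are closed below via
+ * kiblnd_close_stale_conns_locked().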
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer->ibp_conns); + if (active) + peer->ibp_connecting--; + else + peer->ibp_accepting--; + + if (peer->ibp_version == 0) { + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer->ibp_version != conn->ibc_version || + peer->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer, conn->ibc_version, + conn->ibc_incarnation); + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (!kiblnd_peer_active(peer) || /* peer has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + lnet_ni_t *ni = peer->ibp_ni; + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &txs, -ECONNABORTED); + + return; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock(&conn->ibc_lock); + list_for_each_entry_safe(tx, tmp, &txs, tx_list) { + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +{ + int rc; + + rc = rdma_reject(cmid, rej, sizeof(*rej)); + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + kib_msg_t *reqmsg = priv; + kib_msg_t *ackmsg; + kib_dev_t *ibdev; + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + lnet_ni_t *ni = NULL; + kib_net_t *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + kib_rej_t rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT(!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = (kib_dev_t *)cmid->context; + LASSERT(ibdev != NULL); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("Peer's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. 
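+ * The failed: path below fills ibr_version and the connection
+ * parameters for the version I do speak before sending the reject.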
*/ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid)); + + if (ni != NULL) { + net = (kib_net_t *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept %s on %s (%s:%d:%pI4h): bad dst nid %s\n", + libcfs_nid2str(nid), + ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(version)) { + CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", + libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(version)) { + CERROR("Can't accept %s(version %x): incompatible max_frags %d (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer; create */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s [old ver: %x, new ver: %x]\n", + libcfs_nid2str(nid), peer2->ibp_version, version); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* tie-break connection race in favour of the 
higher NID */ + if (peer2->ibp_connecting != 0 && + nid < ni->ni_nid) { + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer); + peer = peer2; + } else { + /* Brand new peer */ + LASSERT(peer->ibp_accepting == 0); + LASSERT(peer->ibp_version == 0 && + peer->ibp_incarnation == 0); + + peer->ibp_accepting = 1; + peer->ibp_version = version; + peer->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 0, -ENOMEM); + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. */ + + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version); + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version); + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version) + <= IBLND_RX_MSGS(version)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) + lnet_ni_decref(ni); + + rej.ibr_version = version; + rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_reconnect(kib_conn_t *conn, int version, + __u64 incarnation, int why, kib_connparams_t *cp) +{ + kib_peer_t *peer = conn->ibc_peer; + char *reason; + int retry = 0; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */ + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * 
initiated by kiblnd_query() */ + if ((!list_empty(&peer->ibp_tx_queue) || + peer->ibp_version != version) && + peer->ibp_connecting == 1 && + peer->ibp_accepting == 0) { + retry = 1; + peer->ibp_connecting++; + + peer->ibp_version = version; + peer->ibp_incarnation = incarnation; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!retry) + return; + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + } + + CNETERR("%s: retrying (%s), %x, %x, queue_dep: %d, max_frag: %d, msg_size: %d\n", + libcfs_nid2str(peer->ibp_nid), + reason, IBLND_MSG_VERSION, version, + cp != NULL ? cp->ibcp_queue_depth : IBLND_MSG_QUEUE_SIZE(version), + cp != NULL ? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version), + cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE); + + kiblnd_connect_peer(peer); +} + +static void +kiblnd_rejected(kib_conn_t *conn, int reason, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { + kib_rej_t *rej = priv; + kib_connparams_t *cp = NULL; + int flip = 0; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = 1; + } + + if (priv_nob >= sizeof(kib_rej_t) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. 
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer %s: %x\n", + libcfs_nid2str(peer->ibp_nid), rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + kiblnd_reconnect(conn, rej->ibr_version, + incarnation, rej->ibr_why, cp); + break; + + case IBLND_REJECT_MSG_QUEUE_SIZE: + CERROR("%s rejected: incompatible message queue depth %d, %d\n", + libcfs_nid2str(peer->ibp_nid), + cp != NULL ? cp->ibcp_queue_depth : + IBLND_MSG_QUEUE_SIZE(rej->ibr_version), + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + break; + + case IBLND_REJECT_RDMA_FRAGS: + CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n", + libcfs_nid2str(peer->ibp_nid), + cp != NULL ? cp->ibcp_max_frags : + IBLND_RDMA_FRAGS(rej->ibr_version), + IBLND_RDMA_FRAGS(conn->ibc_version)); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_why); + break; + } + break; + } + /* fall through */ + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); +} + +static void +kiblnd_check_connreply(kib_conn_t *conn, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + lnet_ni_t *ni = peer->ibp_ni; + kib_net_t *net = ni->ni_data; + kib_msg_t *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT(net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with requested version %x\n", + libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(ver)) { + CERROR("%s has incompatible queue depth %d(%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(ver)) { + CERROR("%s has incompatible max_frags %d (%d wanted)\n", + 
libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == ni->ni_nid && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n", + libcfs_nid2str(peer->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver); + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) + <= IBLND_RX_MSGS(ver)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT(rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect(struct rdma_cm_id *cmid) +{ + kib_peer_t *peer = (kib_peer_t *)cmid->context; + kib_conn_t *conn; + kib_msg_t *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer->ibp_incarnation; + version = (peer->ibp_version == 0) ? IBLND_MSG_VERSION : + peer->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 1, -ENOMEM); + kiblnd_peer_decref(peer); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer->ibp_ni, msg, version, + 0, peer->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + + rc = rdma_connect(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + kib_peer_t *peer; + kib_conn_t *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer = (kib_peer_t *)cmid->context; + + CDEBUG(D_NET, "%s Addr resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) + return 0; + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer, 1, rc); + kiblnd_peer_decref(peer); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NET, "%s Route resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, event->status); + kiblnd_peer_decref(peer); + return event->status; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), 
event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + case RDMA_CM_EVENT_DISCONNECTED: + conn = (kib_conn_t *)cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +{ + kib_tx_t *tx; + struct list_head *ttmp; + + list_for_each(ttmp, txs) { + tx = list_entry(ttmp, kib_tx_t, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CERROR("Timed out tx: %s, %lu seconds\n", + kiblnd_queue2str(conn, txs), + cfs_duration_sec(jiffies - tx->tx_deadline)); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(kib_conn_t *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void +kiblnd_check_conns(int idx) +{ + LIST_HEAD(closes); + LIST_HEAD(checksends); + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + kib_conn_t *tmp; + struct list_head *ctmp; + 
unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we just use a shared lock while we + * take a look... */ + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each(ptmp, peers) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + + list_for_each(ctmp, &peer->ibp_conns) { + int timedout; + int sendnoop; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer->ibp_nid), + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive), + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, + &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) { + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!list_empty(&checksends)) { + conn = list_entry(checksends.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_check_sends(conn); + kiblnd_conn_decref(conn); + } +} + +static void +kiblnd_disconnect_conn(kib_conn_t *conn) +{ + LASSERT(!in_interrupt()); + LASSERT(current == kiblnd_data.kib_connd); + LASSERT(conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +int +kiblnd_connd(void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + + dropped_lock = 0; + + if (!list_empty(&kiblnd_data.kib_connd_zombies)) { + conn = list_entry(kiblnd_data. \ + kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (!list_empty(&kiblnd_data.kib_connd_conns)) { + conn = list_entry(kiblnd_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + /* careful with the jiffy wrap... 
*/ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += p * HZ; + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + schedule_timeout(timeout); + + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete(struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
*/ + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + kib_conn_t *conn; + wait_queue_t wait; + unsigned long flags; + struct ib_wc wc; + int did_something; + int busy_loops = 0; + int rc; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Failed to bind on CPT %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", + sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!list_empty(&sched->ibs_conns)) { + conn = list_entry(sched->ibs_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + busy_loops = 0; + + remove_wait_queue(&sched->ibs_waitq, &wait); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_dev_t *dev; + wait_queue_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + int do_failover = 0; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (time_before(cfs_time_current(), + dev->ibd_next_failover)) + continue; + do_failover = 1; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev); + + write_lock_irqsave(glock, flags); + + LASSERT(dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = cfs_time_shift(3); + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = + cfs_time_shift(min(dev->ibd_failed_failover, 10)); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : + cfs_time_seconds(1)); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 000000000..eedf01afd --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. 
*/ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, int, 0444); +MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends; +module_param(concurrent_sends, int, 0444); +MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); + +static int map_on_demand; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, "map on demand"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int pmr_pool_size = 512; +module_param(pmr_pool_size, int, 0444); +MODULE_PARM_DESC(pmr_pool_size, "size of MR cache pmr pool on each CPT"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +kib_tunables_t kiblnd_tunables = { + .kib_dev_failover = 
&dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peertxcredits = &peer_credits, + .kib_peercredits_hiw = &peer_credits_hiw, + .kib_peerrtrcredits = &peer_buffer_credits, + .kib_peertimeout = &peer_timeout, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_concurrent_sends = &concurrent_sends, + .kib_ib_mtu = &ib_mtu, + .kib_map_on_demand = &map_on_demand, + .kib_fmr_pool_size = &fmr_pool_size, + .kib_fmr_flush_trigger = &fmr_flush_trigger, + .kib_fmr_cache = &fmr_cache, + .kib_pmr_pool_size = &pmr_pool_size, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds +}; + +int +kiblnd_tunables_init(void) +{ + if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT; + + if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX; + + if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits) + *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits; + + if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1; + + if (*kiblnd_tunables.kib_map_on_demand < 0 || + *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS) + *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */ + + if (*kiblnd_tunables.kib_map_on_demand == 1) + *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */ + + if (*kiblnd_tunables.kib_concurrent_sends == 0) { + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2; + else + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits); + } + + if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) { + CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n", + *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits); + } + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile new file mode 100644 index 000000000..f3fb8778c --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o diff --git 
a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 000000000..7586b7e40 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2886 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include "socklnd.h" + +static lnd_t the_ksocklnd; +ksock_nal_data_t ksocknal_data; + +static ksock_interface_t * +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) +{ + ksock_net_t *net = ni->ni_data; + int i; + ksock_interface_t *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return iface; + } + + return NULL; +} + +static ksock_route_t * +ksocknal_create_route(__u32 ipaddr, int port) +{ + ksock_route_t *route; + + LIBCFS_ALLOC(route, sizeof(*route)); + if (route == NULL) + return NULL; + + atomic_set(&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return route; +} + +void +ksocknal_destroy_route(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE(route, sizeof(*route)); +} + +static int +ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; + + LASSERT(id.nid != LNET_NID_ANY); + LASSERT(id.pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) + return -ENOMEM; + + peer->ksnp_ni = ni; + peer->ksnp_id = id; + atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + peer->ksnp_accepting = 0; + peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; + peer->ksnp_zc_next_cookie = 
SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD(&peer->ksnp_conns); + INIT_LIST_HEAD(&peer->ksnp_routes); + INIT_LIST_HEAD(&peer->ksnp_tx_queue); + INIT_LIST_HEAD(&peer->ksnp_zc_req_list); + spin_lock_init(&peer->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer; + return 0; +} + +void +ksocknal_destroy_peer(ksock_peer_t *peer) +{ + ksock_net_t *net = peer->ksnp_ni->ni_data; + + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); + + LASSERT(atomic_read(&peer->ksnp_refcount) == 0); + LASSERT(peer->ksnp_accepting == 0); + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(list_empty(&peer->ksnp_tx_queue)); + LASSERT(list_empty(&peer->ksnp_zc_req_list)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. */ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +ksock_peer_t * +ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each(tmp, peer_list) { + + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + atomic_read(&peer->ksnp_refcount)); + return peer; + } + return NULL; +} + +ksock_peer_t * +ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return peer; +} + +static void +ksocknal_unlink_peer_locked(ksock_peer_t *peer) +{ + int i; + __u32 ip; + ksock_interface_t *iface; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + ip = peer->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + /* All IPs in peer->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. 
*/ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", + peer, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del(&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer); +} + +static int +ksocknal_get_peer_info(lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_n_passive_ips == 0 && + list_empty(&peer->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = peer->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each(rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *id = peer->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } + out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_myipaddr); + } else { + CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &route->ksnr_myipaddr, + &conn->ksnc_myipaddr); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<<type); + route->ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_route_t *route2; + + LASSERT(!peer->ksnp_closing); + LASSERT(route->ksnr_peer == NULL); + LASSERT(!route->ksnr_scheduled); + LASSERT(!route->ksnr_connecting); + LASSERT(route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr ==
route->ksnr_ipaddr) { + CERROR("Duplicate route %s %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr); + LBUG(); + } + } + + route->ksnr_peer = peer; + ksocknal_peer_addref(peer); + /* peer's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +static void +ksocknal_del_route_locked(ksock_route_t *route) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT(!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del(&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer's ref */ + + if (list_empty(&peer->ksnp_routes) && + list_empty(&peer->ksnp_conns)) { + /* I've just removed the last route to a peer with no active + * connections */ + ksocknal_unlink_peer_locked(peer); + } +} + +int +ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) +{ + struct list_head *tmp; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return -EINVAL; + + /* Have a brand new peer ready... 
*/ + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route(ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer); + return -ENOMEM; + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer); + peer = peer2; + } else { + /* peer table takes my ref on peer */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(id.nid)); + } + + route2 = NULL; + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip) +{ + ksock_conn_t *conn; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT(!peer->ksnp_closing); + + /* Extra ref prevents peer disappearing until I'm done with it */ + ksocknal_peer_addref(peer); + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked(route); + } + + nshared = 0; + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked(route); + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer); + /* NB peer unlinks itself when last conn/route is removed */ +} + +static int +ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) +{ + LIST_HEAD(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer); /* a ref for me... 
*/ + + ksocknal_del_peer_locked(peer, ip); + + if (peer->ksnp_closing && + !list_empty(&peer->ksnp_tx_queue)) { + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + + list_splice_init(&peer->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); + + return rc; +} + +static ksock_conn_t * +ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + list_for_each(ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data.ksnd_global_lock); + return conn; + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static ksock_sched_t * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; + + LASSERT(info->ksi_nthreads > 0); + + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + + return sched; +} + +static int +ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs) +{ + ksock_net_t *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT(nip <= LNET_MAX_INTERFACES); + + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT(ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return nip; +} + +static int +ksocknal_match_peerip(ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = ips[i] ^ iface->ksni_ipaddr; + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT(best >= 0); + return best; +} + +static int +ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT(n_peerips <= LNET_MAX_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + min(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer IPs that match old interfaces, then choose new + * interfaces to match the remaining peer IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + + } else { + /* choose a new interface */ + LASSERT(i == peer->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer->ksnp_n_passive_ips; k++) + if (peer->ksnp_passive_ips[k] == ip) + break; + + if (k < peer->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = ip ^ peerips[k]; + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer->ksnp_passive_ips[i] = ip; + peer->ksnp_n_passive_ips = i+1; + } + + /* mark the best matching peer IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer IP addresses */ + memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return n_ips; +} + +static void +ksocknal_create_routes(ksock_peer_t *peer, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept(lnet_ni_t *ni, struct socket *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n", + &peer_ip); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(ksock_peer_t *peer, __u32 ipaddr) +{ + ksock_route_t *route; + + list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) { + + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD(zombies); + lnet_process_id_t peerid; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_t *peer = NULL; + ksock_peer_t *peer2; + ksock_sched_t *sched; + ksock_hello_msg_t *hello; + int cpt; + ksock_tx_t *tx; + 
ksock_tx_t *txtmp; + int rc; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT(active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set(&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set(&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs(conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + peer = route->ksnr_peer; + LASSERT(ni == peer->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT(rc == 0 || active); + LASSERT(conn->ksnc_proto != NULL); + LASSERT(peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid); + + if (active) { + ksocknal_peer_addref(peer); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer in the peer + * table (which takes my ref) */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); + peer = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... 
*/ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ + rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + + if (peer->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer->ksnp_conns)); + + peer->ksnp_proto = conn->ksnc_proto; + peer->ksnp_incarnation = incarnation; + } + + if (peer->ksnp_proto != conn->ksnc_proto || + peer->ksnp_incarnation != incarnation) { + /* Peer rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer, 0, 0); + + peer->ksnp_proto = NULL; + rc = ESTALE; + warn = peer->ksnp_incarnation != incarnation ? + "peer rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer + * realises we're connected. */ + LASSERT(rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %pI4h connected to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_ipaddr); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer; /* conn takes my ref on peer */ + peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_send_keepalive = 0; + peer->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = cfs_time_current(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with adding to peer's conn list */ + + list_add(&conn->ksnc_list, &peer->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. 
*/ + list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. 
*/ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + + failed_2: + if (!peer->ksnp_closing && + list_empty(&peer->ksnp_conns) && + list_empty(&peer->ksnp_routes)) { + list_add(&zombies, &peer->ksnp_tx_queue); + list_del_init(&peer->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); + + failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + LIBCFS_FREE(conn, sizeof(*conn)); + + failed_0: + libcfs_sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked(ksock_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. + * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; + + LASSERT(peer->ksnp_error == 0); + LASSERT(!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del(&conn->ksnc_list); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... 
*/ + LASSERT(!route->ksnr_deleted); + LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); + + conn2 = NULL; + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + + conn->ksnc_route = NULL; + +#if 0 /* irrelevant with only eager routes */ + /* make route least favourite */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); +#endif + ksocknal_route_decref(route); /* drop conn's ref on route */ + } + + if (list_empty(&peer->ksnp_conns)) { + /* No more connections to this peer */ + + if (!list_empty(&peer->ksnp_tx_queue)) { + ksock_tx_t *tx; + + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + list_for_each_entry(tx, &peer->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + peer->ksnp_proto = NULL; /* renegotiate protocol version */ + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty(&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * peer with no routes to it */ + ksocknal_unlink_peer_locked(peer); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed(ksock_peer_t *peer) +{ + int notify = 0; + unsigned long last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existence. */ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = peer->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + LIST_HEAD(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. 
+ * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT(list_empty(&peer->ksnp_conns)); + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(ksock_conn_t *conn) +{ + unsigned long last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG(D_NET, "connection %p\n", conn); + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + LASSERT(atomic_read(&conn->ksnc_sock_refcount) == 0); + LASSERT(conn->ksnc_sock == NULL); + LASSERT(conn->ksnc_route == NULL); + LASSERT(!conn->ksnc_tx_scheduled); + LASSERT(!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %d, left: %d, last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize(conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from 
%s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + break; + default: + LBUG(); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +ksocknal_close_peer_conns_locked(ksock_peer_t *peer, __u32 ipaddr, int why) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked(conn, why); + } + } + + return count; +} + +int +ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why) +{ + ksock_peer_t *peer = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked(peer, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return count; +} + +int +ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked(peer, ipaddr, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) + return 0; + + if (count == 0) + return -ENOENT; + else + return 0; +} + +void +ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + lnet_process_id_t id = {0}; + + id.nid = gw_nid; + id.pid = LNET_PID_ANY; + + CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns(id, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. 
*/ +} + +void +ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) +{ + int connect = 1; + unsigned long last_alive = 0; + unsigned long now = cfs_time_current(); + ksock_peer_t *peer = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = now; + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = last_alive; + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + write_unlock_bh(glock); + return; +} + +static void +ksocknal_push_peer(ksock_peer_t *peer) +{ + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each(tmp, &peer->ksnp_conns) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn(conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + index = 0; + peer = NULL; + + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + + if (index++ == j) { + ksocknal_peer_addref(peer); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (peer != NULL) { + rc = 0; + ksocknal_push_peer(peer); + ksocknal_peer_decref(peer); + } + } + + } + + return rc; +} + +static int +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) +{ + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_t *peer; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return -EINVAL; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + 
iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, + ksnp_list); + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) + if (peer->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, + ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static void +ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) + if (peer->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer->ksnp_n_passive_ips; j++) + peer->ksnp_passive_ips[j-1] = + peer->ksnp_passive_ips[j]; + peer->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked(conn, 0); + } +} + +static int +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) +{ + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_t *peer; + __u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +int +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + lnet_process_id_t id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch (cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return 
ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LUSTRE_SRV_LNET_PID; + return ksocknal_add_peer(ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer(ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns(id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers(void) +{ + LASSERT(atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } + + LIBCFS_FREE(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + 
+ case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + } + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + + sched = &info->ksi_scheds[j]; + LASSERT(list_empty( + &sched->kss_tx_conns)); + LASSERT(list_empty( + &sched->kss_rx_conns)); + LASSERT(list_empty( + &sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; + wake_up_all(&sched->kss_waitq); + } + } + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +static __u64 +ksocknal_new_incarnation(void) +{ + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. 
+ */ + return ktime_get_ns(); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched_info *info; + int rc; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) + goto failed; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = get_seconds(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((ulong_ptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + 
ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static void +ksocknal_debug_peerhash(lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) + break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", + libcfs_id2str(peer->ksnp_id), + atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !list_empty(&peer->ksnp_tx_queue), + !list_empty(&peer->ksnp_zc_req_list)); + + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n", + atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown(lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {0}; + + anyid.nid = LNET_NID_ANY; + anyid.pid = LNET_PID_ANY; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT(net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + strncpy(&net->ksnn_interfaces[j].ksni_name[0], + names[i], IFNAMSIZ); + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +static int +ksocknal_search_new_ipif(ksock_net_t *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = + &tmp->ksnn_interfaces[j].ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched_info *info) +{ + int nthrs; + int rc = 0; + int i; + + if (info->ksi_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = info->ksi_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + info->ksi_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, info->ksi_nthreads_max); + } else { + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + info->ksi_cpt, 
info->ksi_nthreads + i, rc); + break; + } + + info->ksi_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); + + for (i = 0; i < ncpts; i++) { + struct ksock_sched_info *info; + int cpt = (cpts == NULL) ? i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + info = ksocknal_data.ksnd_sched_info[cpt]; + + if (!newif && info->ksi_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(info); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT(ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + + strncpy(&net->ksnn_interfaces[i].ksni_name[0], + ni->ni_interfaces[i], IFNAMSIZ); + } + net->ksnn_ninterfaces = i; + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +static void __exit +ksocknal_module_fini(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static int __init +ksocknal_module_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("3.0.0"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 000000000..c54c99551 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include "socklnd_lib-linux.h" + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnet.h" +#include "../../../include/linux/lnet/lib-lnet.h" +#include "../../../include/linux/lnet/socklnd.h" +#include "../../../include/linux/lnet/lnet-sysctl.h" + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. 
*/ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + /* conn waiting to be written */ + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + /* # connections assigned to this scheduler */ + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; + struct kvec kss_scratch_iov[LNET_MAX_IOV]; +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +typedef struct /* in-use interface */ +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +} ksock_interface_t; + +typedef struct { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... */ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */ + int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +} ksock_tunables_t; + +typedef struct { + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? 
*/ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES]; +} ksock_net_t; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +typedef struct { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched_info **ksnd_sched_info; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/ + struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */ + struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + unsigned long ksnd_reaper_waketime;/* when reaper will wake */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_connd_connreqs; /* incoming connection requests */ + struct list_head ksnd_connd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ + int ksnd_connd_connecting;/* # connds connecting */ + /** time stamp of the last failed connecting attempt */ + long ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + long ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + spinlock_t ksnd_connd_lock; /* serialise */ + + struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */ + spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */ + +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet iovec frags */ + struct kvec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + unsigned long tx_deadline; /* when (in jiffies) tx times out */ + ksock_msg_t tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct kvec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct kvec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +} ksock_tx_t; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +typedef union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +typedef struct ksock_conn { + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* reader */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct kvec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + ksock_msg_t ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * lnet_hdr_t, it's stored in + * ksnc_msg.ksm_u.lnetmsg */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head 
ksnc_tx_queue; /* packets waiting to be sent */ + ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */ + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ + int ksnc_tx_bufnob; /* send buffer marker */ + atomic_t ksnc_tx_nob; /* # bytes queued */ + int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + unsigned long ksnc_tx_last_post; /* time stamp of the last posted TX */ +} ksock_conn_t; + +typedef struct ksock_route { + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ + long ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +} ksock_route_t; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +typedef struct ksock_peer { + struct list_head ksnp_list; /* stash on global peer list */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ + lnet_process_id_t ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ + unsigned long ksnp_send_keepalive; /* time to send keepalive */ + lnet_ni_t *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
*/ + __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_t; + +typedef struct ksock_connreq { + struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ + lnet_ni_t *ksncr_ni; /* chosen NI */ + struct socket *ksncr_sock; /* accepted socket */ +} ksock_connreq_t; + +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +typedef struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(ksock_msg_t *); /* message unpack */ + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +} ksock_proto_t; + +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return &ksocknal_data.ksnd_peers[hash]; +} + +static inline void +ksocknal_conn_addref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(ksock_conn_t *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static inline void +ksocknal_connsock_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT(conn->ksnc_closing); + libcfs_sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } 
+} + +static inline void +ksocknal_tx_addref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done(lnet_ni_t *ni, ksock_tx_t *tx); + +static inline void +ksocknal_tx_decref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx); +} + +static inline void +ksocknal_route_addref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route(ksock_route_t *route); + +static inline void +ksocknal_route_decref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route(route); +} + +static inline void +ksocknal_peer_addref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + atomic_inc(&peer->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(ksock_peer_t *peer); + +static inline void +ksocknal_peer_decref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer->ksnp_refcount)) + ksocknal_destroy_peer(peer); +} + +int ksocknal_startup(lnet_ni_t *ni); +void ksocknal_shutdown(lnet_ni_t *ni); +int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(lnet_ni_t *ni, struct socket *sock); + +extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); +extern ksock_peer_t *ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id); +extern ksock_peer_t *ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id); +extern void ksocknal_peer_failed(ksock_peer_t *peer); +extern int ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn(ksock_conn_t *conn); +extern void ksocknal_destroy_conn(ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked(ksock_peer_t *peer, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why); +extern int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer, + ksock_tx_t *tx, int nonblk); + +extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx, + lnet_process_id_t id); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked(ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini(void); +extern void 
ksocknal_launch_all_connections_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connectable_route_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connecting_route_locked(ksock_peer_t *peer); +extern int ksocknal_new_packet(ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +extern int ksocknal_send_hello(lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello); +extern int ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_push_conn(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); + +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 000000000..fa7ad883b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2634 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +ksock_tx_t * +ksocknal_alloc_tx(int type, int size) +{ + ksock_tx_t *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. 
\ + next, ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +ksock_tx_t * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + ksock_tx_t *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct kvec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT (tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base = (void *)((char *)iov->iov_base + nob); + iov->iov_len -= nob; + return rc; + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT (tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT (conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive (ksock_conn_t *conn) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. 
*/ + int rc; + + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + return rc; +} + +void +ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) +{ + lnet_msg_t *lnetmsg = tx->tx_lnetmsg; + int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + if (ni == NULL && tx->tx_conn != NULL) + ni = tx->tx_conn->ksnc_peer->ksnp_ni; + + ksocknal_free_tx (tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize (ni, lnetmsg, rc); +} + +void +ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error) +{ + ksock_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del (&tx->tx_list); + + LASSERT (atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done (ni, tx); + } +} + +static void +ksocknal_check_zc_req(ksock_tx_t *tx) +{ + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_t *peer = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. 
See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; + + if (peer->ksnp_zc_next_cookie == 0) + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); + + spin_unlock(&peer->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(ksock_tx_t *tx) +{ + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit (conn, tx); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + return rc; + } + + /* Actual error */ + LASSERT (rc < 0); + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", + &conn->ksnc_ipaddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", + &conn->ksnc_ipaddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked (ksock_route_t *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked (ksock_peer_t *peer) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = cfs_time_current(); + + return conn; +} + +void +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +{ + ksock_sched_t *sched = conn->ksnc_scheduler; + ksock_msg_t *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. 
*/ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + /* + * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ + * but they're used inside spinlocks a lot. + */ + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = 0; + mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? */ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +{ + unsigned long now = cfs_time_current(); + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) /* connections being established */ + continue; + + /* all route types connected ? 
*/ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", + &route->ksnr_ipaddr, + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); + continue; + } + + return route; + } + + return NULL; +} + +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return route; + } + + return NULL; +} + +int +ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) +{ + ksock_peer_t *peer; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return 0; + } + } + } + + /* I'll need a write lock... */ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to userspace process %s\n", + libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer); + + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return 0; + } + + if (peer->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer) != NULL) { + /* the message is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + /* Queue the message until a connection is established */ + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; +} + +int +ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + int mpflag = 1; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
*/ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return -ENOMEM; + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (!mpflag) + cfs_memory_pressure_restore(mpflag); + + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return -EIO; +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, "%s", name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
*/ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare lnet_hdr_t */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); + conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return 1; + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return 0; +} + +static int +ksocknal_process_receive (ksock_conn_t *conn) +{ + lnet_hdr_t *lhdr; + lnet_process_id_t *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); + + if (rc <= 0) { + LASSERT (rc != -EAGAIN); + + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", + conn, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return -EAGAIN; + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EPROTO; + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EIO; + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return rc; + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); + conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return -EPROTO; + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 
0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return -EPROTO; + } + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return -EINVAL; /* keep gcc happy */ +} + +int +ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up (&sched->kss_waitq); + LASSERT (conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = !ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; + + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + if (rc != 0) { + CERROR("Can't set CPT affinity to %d: %d\n", + info->ksi_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + 
while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail (&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty (&sched->kss_tx_conns)) { + LIST_HEAD (zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty (&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + +static ksock_proto_t * +ksocknal_parse_proto_version (ksock_hello_msg_t *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; + + CLASSERT (sizeof (lnet_magicversion_t) == + offsetof (ksock_hello_msg_t, kshm_src_nid)); + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + + LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT (conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return type; + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + 
return SOCKLND_CONN_NONE; + } +} + +int +ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + struct socket *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + ksock_proto_t *proto; + lnet_process_id_t recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? *ksocknal_tunables.ksnd_timeout : + lnet_acceptor_timeout(); + + rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n", + __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, + &conn->ksnc_ipaddr); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer, tell peer my protocol */ + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, ni->ni_nid, hello); + } + + CERROR("Unknown protocol version (%d.x expected) from %pI4h\n", + conn->ksnc_proto->pro_version, + &conn->ksnc_ipaddr); + + return -EPROTO; + } + + proto_match = (conn->ksnc_proto == proto); + conn->ksnc_proto = proto; + + /* receive the rest of hello message anyway */ + rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); + if (rc != 0) { + CERROR("Error %d reading or checking hello from from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + *incarnation = hello->kshm_src_incarnation; + + if (hello->kshm_src_nid == LNET_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n", + &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (!active && + conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + /* Userspace NAL assigns peer process ID from socket */ + recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; + recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR("Unexpected type %d from %s ip %pI4h\n", + hello->kshm_ctype, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr); + return -EPROTO; + } + + return 0; + } + + if (peerid->pid != recv_id.pid || + peerid->nid != recv_id.nid) { + LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please 
check your Lustre configuration.\n", + libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + hello->kshm_ctype); + return -EPROTO; + } + + return 0; +} + +static int +ksocknal_connect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_peer_t *peer = route->ksnr_peer; + int type; + int wanted; + struct socket *sock; + unsigned long deadline; + int retry_later = 0; + int rc = 0; + + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer/route got closed under me, or + * route got connected while queued */ + if (peer->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer is connecting to me */ + if (peer->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (cfs_time_aftereq(cfs_time_current(), deadline)) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer %s: conn race, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer's incoming connection request */ + + if (rc == EALREADY || + (rc == 0 && peer->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + + ksocknal_launch_connection_locked(route); + } + 
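The failed: path just below doubles ksnr_retry_interval and clamps it between the min_reconnectms and max_reconnectms tunables (milliseconds converted to ticks). A minimal userspace sketch of that back-off policy, using the 1000 ms / 60000 ms defaults from socklnd_modparams.c in place of the ksocknal_tunables pointers:

/* Bounded exponential back-off, as applied on connect failure:
 * double the interval, then clamp to [min_ms, max_ms]. */
#include <stdio.h>

static long backoff(long interval_ms, long min_ms, long max_ms)
{
	interval_ms *= 2;
	if (interval_ms < min_ms)
		interval_ms = min_ms;	/* first failure starts at the minimum */
	if (interval_ms > max_ms)
		interval_ms = max_ms;	/* never wait longer than the ceiling */
	return interval_ms;
}

int main(void)
{
	long ms = 0;
	int i;

	for (i = 0; i < 8; i++) {
		ms = backoff(ms, 1000, 60000);
		printf("failure %d: retry in %ld ms\n", i + 1, ms);
	}
	return 0;
}

Starting from zero, the waits run 1, 2, 4, 8, 16, 32 seconds and then stay pinned at the 60 s ceiling.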
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + /* This is a retry rather than a new connection */ + route->ksnr_retry_interval *= 2; + route->ksnr_retry_interval = + max(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + route->ksnr_retry_interval = + min(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + + if (!list_empty(&peer->ksnp_tx_queue) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + ksock_conn_t *conn; + + /* ksnp_tx_queue is queued on a conn on successful + * connection for V1.x and V2.x */ + if (!list_empty (&peer->ksnp_conns)) { + conn = list_entry(peer->ksnp_conns.next, + ksock_conn_t, ksnc_list); + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + } + + /* take all the blocked packets while I've got the lock and + * complete below... */ + list_splice_init(&peer->ksnp_tx_queue, &zombies); + } + +#if 0 /* irrelevant with only eager routes */ + if (!route->ksnr_deleted) { + /* make this route least-favourite for re-selection */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + } +#endif + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_peer_failed(peer); + ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); + return 0; +} + +/* + * check whether we need to create more connds. + * It will try to create new thread if it's necessary, @timeout can + * be updated if failed to create, so caller wouldn't keep try while + * running out of resource. + */ +static int +ksocknal_connd_check_start(long sec, long *timeout) +{ + char name[16]; + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + snprintf(name, sizeof(name), "socknal_cd%02d", total); + rc = ksocknal_thread_start(ksocknal_connd, NULL, name); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = get_seconds(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. 
+ * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(long sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + ksock_route_t *route; + unsigned long now; + + now = cfs_time_current(); + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd (void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; + wait_queue_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs (); + + init_waitqueue_entry(&wait, current); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + ksock_route_t *route = NULL; + long sec = get_seconds(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ + next, ksock_connreq_t, ksncr_list); + + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) { + route = ksocknal_connd_get_route_locked(&timeout); + } + if (route != NULL) { + list_del (&route->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive re-connecting to %pI4h\n", + &route->ksnr_ipaddr); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + + ksocknal_route_decref(route); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + spin_unlock_bh(connd_lock); + nloops = 0; + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_unlock_bh(connd_lock); + + nloops = 0; + schedule_timeout(timeout); + + remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_lock_bh(connd_lock); + } + ksocknal_data.ksnd_connd_running--; + spin_unlock_bh(connd_lock); + + ksocknal_thread_fini(); + return 0; +} + +static ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_t *peer) +{ + /* We're called with a shared lock on ksnd_global_lock */ + ksock_conn_t *conn; + struct list_head *ctmp; + + list_for_each (ctmp, &peer->ksnp_conns) { + int error; + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + /* Don't need the {get,put}connsock dance to deref ksnc_sock */ + LASSERT (!conn->ksnc_closing); + + /* SOCK_ERROR will reset error code of socket in + * some platform (like Darwin8.x) */ + error = conn->ksnc_sock->sk->sk_err; + if (error != 0) { + ksocknal_conn_addref(conn); + + switch (error) { + case ECONNRESET: + CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + case ETIMEDOUT: + CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + default: + CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n", + error, + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + } + + return conn; + } + + if (conn->ksnc_rx_started && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + ksocknal_conn_addref(conn); + CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %d left %d\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port, + conn->ksnc_rx_state, + conn->ksnc_rx_nob_wanted, + conn->ksnc_rx_nob_left); + return conn; + } + + if ((!list_empty(&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->sk_wmem_queued != 0) && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending or + * buffered in the socket's send 
buffer */ + ksocknal_conn_addref(conn); + CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + return conn; + } + } + + return NULL; +} + +static inline void +ksocknal_flush_stale_txs(ksock_peer_t *peer) +{ + ksock_tx_t *tx; + LIST_HEAD (stale_txs); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &stale_txs); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); +} + +static int +ksocknal_send_keepalive_locked(ksock_peer_t *peer) +{ + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + + if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */ + return 0; + + if (peer->ksnp_proto != &ksocknal_protocol_v3x) + return 0; + + if (*ksocknal_tunables.ksnd_keepalive <= 0 || + time_before(cfs_time_current(), + cfs_time_add(peer->ksnp_last_alive, + cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + return 0; + + if (time_before(cfs_time_current(), peer->ksnp_send_keepalive)) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer if we failed to send keepalive this time */ + peer->ksnp_send_keepalive = cfs_time_shift(10); + + conn = ksocknal_find_conn_locked(peer, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +static void +ksocknal_check_peer_timeouts (int idx) +{ + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + ksock_peer_t *peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock(&ksocknal_data.ksnd_global_lock); + + list_for_each_entry(peer, peers, ksnp_list) { + unsigned long deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn (peer); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer, since we dropped + * ksnd_global_lock and it might be dead already! 
*/ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty (&peer->ksnp_tx_queue)) { + ksock_tx_t *tx = + list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer); + + ksocknal_peer_decref(peer); + goto again; + } + } + + if (list_empty(&peer->ksnp_zc_req_list)) + continue; + + spin_lock(&peer->ksnp_lock); + list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + } + + if (n == 0) { + spin_unlock(&peer->ksnp_lock); + continue; + } + + tx = list_entry(peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + deadline = tx->tx_deadline; + resid = tx->tx_resid; + conn = tx->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer->ksnp_id.nid), tx, + cfs_duration_sec(cfs_time_current() - deadline), + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_reaper (void *arg) +{ + wait_queue_t wait; + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + long timeout; + int i; + int peer_index = 0; + unsigned long deadline = cfs_time_current(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry(&wait, current); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while (!list_empty (&enomem_conns)) { + conn = list_entry (enomem_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... 
*/ + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c new file mode 100644 index 000000000..f5e8ab060 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -0,0 +1,714 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(ksock_conn_t *conn) +{ + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ + LASSERT(!conn->ksnc_closing); + + if (rc != 0) { + CERROR("Error %d getting sock peer IP\n", rc); + return rc; + } + + rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(ksock_conn_t *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); +} + +int +ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + int nob; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + } + return rc; +} + +int +ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack(ksock_conn_t *conn) +{ + int opt = 1; + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +int +ksocknal_lib_recv_iov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT(niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + scratchiov, niov, nob, MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT(niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
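The receive paths here fold the message checksum in fragment by fragment, counting only the bytes the last recvmsg() actually returned and clamping the final fragment to the residue. A self-contained userspace sketch of that accumulation, with a toy additive sum standing in for ksocknal_csum() (crc32_le on Linux) and struct iovec standing in for the kernel kvec/kiov types:

#include <stdio.h>
#include <sys/uio.h>

static unsigned int toy_csum(unsigned int csum, const void *p, size_t len)
{
	const unsigned char *b = p;

	while (len-- > 0)
		csum += *b++;
	return csum;
}

/* 'rc' is what recvmsg() returned; never checksum past it. */
static unsigned int csum_received(const struct iovec *iov, int niov, int rc)
{
	unsigned int csum = ~0U;
	int sum = rc;
	int i;

	for (i = 0; i < niov && sum > 0; i++) {
		int fragnob = iov[i].iov_len;

		if (fragnob > sum)
			fragnob = sum;		/* partial final fragment */
		csum = toy_csum(csum, iov[i].iov_base, fragnob);
		sum -= fragnob;
	}
	return csum;
}

int main(void)
{
	char a[] = "hello", b[] = "world";
	struct iovec iov[2] = { { a, 5 }, { b, 5 } };

	/* pretend recvmsg() delivered 7 bytes: all of a, 2 bytes of b */
	printf("csum = %#x\n", csum_received(iov, 2, 7));
	return 0;
}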
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages); + if (addr != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + n = niov; + } + + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return rc; +} + +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return -ESHUTDOWN; + } + + rc = libcfs_sock_getbuf(sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + } + + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return rc; +} + +int +ksocknal_lib_setup_sock(struct socket *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct linger linger; + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when we close + * it. 
*/ + + linger.l_onoff = 0; + linger.l_linger = 0; + + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (rc != 0) { + CERROR("Can't set SO_LINGER: %d\n", rc); + return rc; + } + + option = -1; + rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set SO_LINGER2: %d\n", rc); + return rc; + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't disable nagle: %d\n", rc); + return rc; + } + } + + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return rc; + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + + option = (do_keepalive ? 1 : 0); + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set SO_KEEPALIVE: %d\n", rc); + return rc; + } + + if (!do_keepalive) + return 0; + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keep_idle, sizeof(keep_idle)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keep_intvl, sizeof(keep_intvl)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keep_count, sizeof(keep_count)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } + + return 0; +} + +void +ksocknal_lib_push_conn(ksock_conn_t *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int val = 1; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock(sk); + + rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + LASSERT(rc == 0); + + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); + + ksocknal_connsock_decref(conn); +} + +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); +/* + * socket call back in Linux + */ +static void +ksocknal_data_ready(struct sock *sk) +{ + ksock_conn_t *conn; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); + sk->sk_data_ready(sk); + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +static void +ksocknal_write_space(struct sock *sk) +{ + ksock_conn_t *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... 
*/ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = SOCKNAL_WSPACE(sk); + min_wpace = SOCKNAL_MIN_WSPACE(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space(sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h new file mode 100644 index 000000000..f5563881b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_PORTAL_ALLOC + +#ifndef __LINUX_SOCKNAL_LIB_H__ +#define __LINUX_SOCKNAL_LIB_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" + +#include +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; +#endif +} + +#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk) +#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk) + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 000000000..86b88db1c --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socklnd.h" + +static int sock_timeout = 50; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. */ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +# define DEFAULT_EAGER_ACK 0 +static int eager_ack = DEFAULT_EAGER_ACK; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = 1<<10; +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + 
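ksocknal_tunables_init() below stores pointers to these module parameters rather than copies, so a value rewritten at runtime through sysfs (the 0644 parameters such as sock_timeout or keepalive) is picked up on the next dereference, while the 0444 parameters are effectively load-time only; loading with explicit options (e.g. modprobe ksocklnd sock_timeout=100, assuming the usual ksocklnd.ko module name) sets the same variables before the pointers are wired. A small userspace sketch of that pointer-table pattern, with hypothetical names:

#include <stdio.h>

static int sock_timeout = 50;		/* stands in for the module_param */

struct tunables {
	int *timeout;			/* pointer, not a snapshot */
};

static struct tunables tun;

static void tunables_init(void)
{
	tun.timeout = &sock_timeout;	/* wired once at init */
}

int main(void)
{
	tunables_init();
	printf("timeout = %d\n", *tun.timeout);	/* 50 */
	sock_timeout = 100;			/* models a sysfs write */
	printf("timeout = %d\n", *tun.timeout);	/* 100, no re-init needed */
	return 0;
}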
+static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = 16 << 10; +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +ksock_tunables_t ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + + + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10; + + return 0; +}; diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 000000000..8596581f5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static ksock_tx_t * +ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(ksock_conn_t *conn) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, + ksock_tx_t, tx_list); + LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + LASSERT(tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. + */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static ksock_tx_t * +ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. 
+ */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT(tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx = conn->ksnc_tx_carrier; + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if 
(cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(ksock_msg_t, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? + sizeof(lnet_hdr_t) : sizeof(ksock_msg_t)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(ksock_msg_t, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +{ + ksock_peer_t *peer = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer, NULL, !!remote); + if (conn != NULL) { + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id); + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + 
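/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * the V3 code above packs zero-copy acknowledgements into the two
 * ksm_zc_cookies[] slots of a carrier NOOP as follows --
 *   cookies[0] == 0           a single cookie, stored in cookies[1]
 *   cookies[0] >  cookies[1]  two separate cookies (at most 2 apart)
 *   cookies[0] <  cookies[1]  the inclusive range cookies[0]..cookies[1]
 * zc_ack_count() is a hypothetical helper (not defined by the patch) that
 * mirrors the arithmetic ksocknal_handle_zcack() applies to such a pair.
 */
static inline __u64
zc_ack_count(__u64 cookie1, __u64 cookie2)
{
        if (cookie1 == 0)               /* single cookie carried in cookie2 */
                return 1;
        if (cookie1 > cookie2)          /* two separate cookies */
                return 2;
        return cookie2 - cookie1 + 1;   /* inclusive range */
}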
LIST_HEAD(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + lnet_magicversion_t *hmv; + int rc; + int i; + + CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + hmv = (lnet_magicversion_t *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (lnet_hdr_t) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = libcfs_sock_write(sock, hdr, sizeof(*hdr), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + rc = libcfs_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(lnet_hdr_t, src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n", + le32_to_cpu(hdr->type), + &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu(hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu(hdr->payload_length) / + sizeof(__u32); + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = libcfs_sock_read(sock, &hello->kshm_src_nid, + offsetof(ksock_hello_msg_t, kshm_ips) - + offsetof(ksock_hello_msg_t, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (hello->kshm_nips 
== 0) + return 0; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t); + + tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t); +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = &tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(ksock_msg_t); + tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(ksock_msg_t *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void +ksocknal_unpack_msg_v2(ksock_msg_t *msg) +{ + return; /* Do nothing */ +} + +ksock_proto_t ksocknal_protocol_v1x = { + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v2x = { + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v3x = { + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; diff --git a/kernel/drivers/staging/lustre/lnet/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/lnet/Makefile new file mode 100644 index 000000000..336b8ea4f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LNET) += lnet.o + +lnet-y := api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \ + lib-md.o lib-ptl.o lib-move.o module.o lo.o 
router.o \ + router_proc.o acceptor.o peer.o diff --git a/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c new file mode 100644 index 000000000..72fd1bf70 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c @@ -0,0 +1,500 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +static struct { + int pta_shutdown; + struct socket *pta_sock; + struct completion pta_signal; +} lnet_acceptor_state; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} +EXPORT_SYMBOL(lnet_acceptor_port); + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +static char *accept = "secure"; + +module_param(accept, charp, 0444); +MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); +module_param(accept_port, int, 0444); +MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); +module_param(accept_backlog, int, 0444); +MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); +module_param(accept_timeout, int, 0644); +MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); + +static char *accept_type; + +static int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. 
*/ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %pI4h on port %d was refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %pI4h was unreachable: the network or that node may be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), &peer_ip); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pI4h on port %d took too long: that node may be hung or experiencing high load.\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h on port %d was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port, + libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at host %pI4h on port %d: is it running a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to connect to %s at host %pI4h on port %d\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s at host %pI4h on port %d\n", + rc, libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + lnet_acceptor_connreq_t cr; + struct socket *sock; + int rc; + int port; + int fatal; + + CLASSERT(sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. 
*/ + + rc = libcfs_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + + failed_sock: + libcfs_sock_release(sock); + failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + + +/* Below is the code common for both kernel and MT user-space */ + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + lnet_acceptor_connreq_t cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + lnet_ni_t *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to LNET magic from %pI4h: %d\n", + &peer_ip, rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) + str = "'old' ranal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h magic %08x: %s acceptor protocol\n", + &peer_ip, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = libcfs_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from %pI4h\n", + rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". 
*/ + int peer_version = cr.acr_version; + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to version %d from %pI4h: %d\n", + peer_version, &peer_ip, rc); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(lnet_acceptor_connreq_t, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pI4h\n", + rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + + ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h for %s: No matching NI\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + if (ni->ni_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h for %s: NI doesn not accept IP connections\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %pI4h\n", + libcfs_nid2str(cr.acr_nid), &peer_ip); + + rc = ni->ni_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +static int +lnet_acceptor(void *arg) +{ + struct socket *newsock; + int rc; + __u32 magic; + __u32 peer_ip; + int peer_port; + int secure = (int)((long_ptr_t)arg); + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + cfs_block_allsigs(); + + rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port %d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + continue; + } + + /* maybe we're waken up with libcfs_sock_abort_accept() */ + if (lnet_acceptor_state.pta_shutdown) { + libcfs_sock_release(newsock); + break; + } + + rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %pI4h: insecure port %d\n", + &peer_ip, peer_port); + goto failed; + } + + rc = libcfs_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pI4h\n", + rc, &peer_ip); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + +failed: + libcfs_sock_release(newsock); + } + + libcfs_sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, 
"Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } + + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; +} + +int +lnet_acceptor_start(void) +{ + int rc; + long rc2; + long secure; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nis() == 0) /* not required */ + return 0; + + rc2 = PTR_ERR(kthread_run(lnet_acceptor, + (void *)(ulong_ptr_t)secure, + "acceptor_%03ld", secure)); + if (IS_ERR_VALUE(rc2)) { + CERROR("Can't start acceptor thread: %ld\n", rc2); + + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_sock == NULL) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c new file mode 100644 index 000000000..4a14e5109 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -0,0 +1,1940 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" +#include +#include + +#define D_LNI D_CONSOLE + +lnet_t the_lnet; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + + +static char *ip2nets = ""; +module_param(ip2nets, charp, 0444); +MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); + +static char *networks = ""; +module_param(networks, charp, 0444); +MODULE_PARM_DESC(networks, "local networks"); + +static char *routes = ""; +module_param(routes, charp, 0444); +MODULE_PARM_DESC(routes, "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +module_param(rnet_htable_size, int, 0444); +MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); + +static char * +lnet_get_routes(void) +{ + return routes; +} + +static char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or 'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); +} + +static void +lnet_fini_locks(void) +{ +} + + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * + sizeof(the_lnet.ln_remote_nets_hash[0])); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... 
*/ + CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT(LNET_MSG_ACK == 0); + CLASSERT(LNET_MSG_PUT == 1); + CLASSERT(LNET_MSG_GET == 2); + CLASSERT(LNET_MSG_REPLY == 3); + CLASSERT(LNET_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + CLASSERT((int)sizeof(lnet_handle_wire_t) == 16); + CLASSERT((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0); + CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8); + CLASSERT((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8); + CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct lnet_magicversion_t */ + CLASSERT((int)sizeof(lnet_magicversion_t) == 8); + CLASSERT((int)offsetof(lnet_magicversion_t, magic) == 0); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4); + CLASSERT((int)offsetof(lnet_magicversion_t, version_major) == 4); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2); + CLASSERT((int)offsetof(lnet_magicversion_t, version_minor) == 6); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct lnet_hdr_t */ + CLASSERT((int)sizeof(lnet_hdr_t) == 72); + CLASSERT((int)offsetof(lnet_hdr_t, dest_nid) == 0); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, src_nid) == 8); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, dest_pid) == 16); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, src_pid) == 20); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, type) == 24); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->type) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, payload_length) == 28); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg) == 40); + + /* Ack */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.offset) == 68); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 
4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.type) == 40); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4); +} + +static lnd_t * +lnet_find_lnd_by_type(int type) +{ + lnd_t *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each(tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, lnd_t, lnd_list); + + if ((int)lnd->lnd_type == type) + return lnd; + } + + return NULL; +} + +void +lnet_register_lnd(lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd(lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT(lnd->lnd_refcount == 0); + + list_del(&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get(lnet_counters_t *counters) +{ + lnet_counters_t *ctr; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_count += ctr->drop_count; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + lnet_counters_t *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(lnet_counters_t)); + + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_reset); + +#ifdef LNET_USE_LIB_FREELIST + +int +lnet_freelist_init(lnet_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT(n > 0); + + size += offsetof(lnet_freeobj_t, fo_contents); + + LIBCFS_ALLOC(space, n * size); + if (space == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do { + memset(space, 0, size); + list_add((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return 0; +} + +void +lnet_freelist_fini(lnet_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + 
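/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_freelist_init() above enlarges every object by
 * offsetof(lnet_freeobj_t, fo_contents) because each free slot is a small
 * list header followed by the caller's payload.  struct free_obj below is
 * a hypothetical stand-in for lnet_freeobj_t (whose full definition is not
 * shown in this hunk), and obj_of()/payload_of() show how one pointer is
 * recovered from the other.  Assumes <linux/list.h> and <linux/stddef.h>.
 */
struct free_obj {
        struct list_head fo_list;       /* links the object into fl_list */
        char             fo_payload[0]; /* caller's object starts here */
};

static inline void *payload_of(struct free_obj *obj)
{
        return obj->fo_payload;
}

static inline struct free_obj *obj_of(void *payload)
{
        return (struct free_obj *)((char *)payload -
                                   offsetof(struct free_obj, fo_payload));
}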
for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT(count == fl->fl_nobjs); + + LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset(fl, 0, sizeof(*fl)); +} + +#endif /* LNET_USE_LIB_FREELIST */ + +static __u64 +lnet_create_interface_cookie(void) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + return ktime_get_ns(); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, lnet_eq_t, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, lnet_libmd_t, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&rec->rec_freelist); +#endif + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, + int cpt, int type, int objnum, int objsz) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + +#ifdef LNET_USE_LIB_FREELIST + memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist)); + rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz); + if (rc != 0) + goto out; +#endif + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type, int objnum, int objsz) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type, objnum, objsz); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } 
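/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_res_container_setup() above seeds each per-CPT container's cookie
 * counter as (cpt << LNET_COOKIE_TYPE_BITS) | type, so every handle cookie
 * drawn from that container keeps the resource type (MD/ME/EQ) in its
 * lowest bits, the owning CPT just above them, and a per-container
 * sequence number in the remaining high bits.  decode_res_cookie() is a
 * hypothetical helper showing how those fields can be recovered; it
 * assumes LNET_COOKIE_MASK covers exactly the type bits, which is how the
 * lookup code uses it.
 */
static void
decode_res_cookie(__u64 cookie, unsigned int *type, unsigned int *cpt,
                  __u64 *seq)
{
        *type = cookie & LNET_COOKIE_MASK;               /* MD/ME/EQ */
        *cpt  = (cookie >> LNET_COOKIE_TYPE_BITS) &
                ((1ULL << LNET_CPT_BITS) - 1);           /* owning CPT */
        *seq  = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
}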
+ } + + return recs; +} + +lnet_libhandle_t * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + lnet_libhandle_t *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + + +int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_nis); + INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); + INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); + INIT_LIST_HEAD(&the_lnet.ln_routers); + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(lnet_counters_t)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS, + sizeof(lnet_eq_t)); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES, + sizeof(lnet_me_t)); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS, + sizeof(lnet_libmd_t)); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +int +lnet_unprepare(void) +{ + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nis)); + LASSERT(list_empty(&the_lnet.ln_nis_cpt)); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_tables_destroy(); + lnet_rtrpools_free(); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + + return 0; +} + +lnet_ni_t * +lnet_net2ni_locked(__u32 net, int cpt) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +lnet_ni_t * +lnet_net2ni(__u32 net) +{ + lnet_ni_t *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni); + +static unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* take lnet_net_lock(any) would be OK */ + if (!list_empty(&the_lnet.ln_nis_cpt)) { + list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) { + if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) + continue; + + LASSERT(ni->ni_cpts != NULL); + return ni->ni_cpts[lnet_nid_cpt_hash + (nid, ni->ni_ncpts)]; + } + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + if (list_empty(&the_lnet.ln_nis_cpt)) + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + cpt = lnet_net_lock_current(); + cpt2 = lnet_cpt_of_nid_locked(nid); + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + + ni = lnet_net2ni_locked(net, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + + lnet_net_unlock(cpt); + + return ni != NULL; +} + +lnet_ni_t * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_ni *ni; + struct list_head *tmp; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_nid == nid) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +int +lnet_islocalnid(lnet_nid_t nid) +{ + 
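/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_nid_cpt_hash() above maps a NID onto one of 'number' CPTs in two
 * steps -- hash the NID into LNET_CPT_BITS bits with hash_long() from
 * <linux/hash.h>, then, because the CPT count need not be a power of two,
 * fold any out-of-range value back with a modulo.  nid_to_bucket() is a
 * hypothetical stand-alone copy of that same mapping.
 */
static unsigned int
nid_to_bucket(__u64 key, unsigned int nbuckets, unsigned int bits)
{
        unsigned int val = hash_long(key, bits);    /* 0 .. 2^bits - 1 */

        if (val < nbuckets)                         /* already in range */
                return val;
        /* fold out-of-range hash values back onto [0, nbuckets) */
        return (unsigned int)(key + val + (val >> 1)) % nbuckets;
}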
struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nis(void) +{ + /* Return the # of NIs that need the acceptor. */ + int count = 0; + struct list_head *tmp; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +static int +lnet_ni_tq_credits(lnet_ni_t *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_maxtxcredits; + + credits = ni->ni_maxtxcredits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_peertxcredits); + credits = min(credits, ni->ni_maxtxcredits); + + return credits; +} + +static void +lnet_shutdown_lndnis(void) +{ + int i; + int islo; + lnet_ni_t *ni; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_shutdown = 1; /* flag shutdown */ + + /* Unlink NIs from the global table */ + while (!list_empty(&the_lnet.ln_nis)) { + ni = list_entry(the_lnet.ln_nis.next, + lnet_ni_t, ni_list); + /* move it to zombie list and nobody can find it anymore */ + list_move(&ni->ni_list, &the_lnet.ln_nis_zombie); + lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */ + + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + } + + /* Drop the cached eqwait NI. */ + if (the_lnet.ln_eq_waitni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0); + the_lnet.ln_eq_waitni = NULL; + } + + /* Drop the cached loopback NI. */ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Clear lazy portals and drop delayed messages which hold refs + * on their lnet_msg_t::msg_rxpeer */ + for (i = 0; i < the_lnet.ln_nportals; i++) + LNetClearLazyPortal(i); + + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + lnet_peer_tables_cleanup(); + + lnet_net_lock(LNET_LOCK_EX); + /* Now wait for the NI's I just nuked to show up on ln_zombie_nis + * and shut them down in guaranteed thread context */ + i = 2; + while (!list_empty(&the_lnet.ln_nis_zombie)) { + int *ref; + int j; + + ni = list_entry(the_lnet.ln_nis_zombie.next, + lnet_ni_t, ni_list); + list_del_init(&ni->ni_list); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_list, &the_lnet.ln_nis_zombie); + break; + } + + if (!list_empty(&ni->ni_list)) { + lnet_net_unlock(LNET_LOCK_EX); + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + ni->ni_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_lnd->lnd_type == LOLND; + + LASSERT(!in_interrupt()); + (ni->ni_lnd->lnd_shutdown)(ni); + + /* can't deref lnd anymore now; it might have unregistered + * itself... 
*/ + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + i = 2; + + lnet_net_lock(LNET_LOCK_EX); + } + + the_lnet.ln_shutdown = 0; + lnet_net_unlock(LNET_LOCK_EX); + + if (the_lnet.ln_network_tokens != NULL) { + LIBCFS_FREE(the_lnet.ln_network_tokens, + the_lnet.ln_network_tokens_nob); + the_lnet.ln_network_tokens = NULL; + } +} + +static int +lnet_startup_lndnis(void) +{ + lnd_t *lnd; + struct lnet_ni *ni; + struct lnet_tx_queue *tq; + struct list_head nilist; + int i; + int rc = 0; + int lnd_type; + int nicount = 0; + char *nets = lnet_get_networks(); + + INIT_LIST_HEAD(&nilist); + + if (nets == NULL) + goto failed; + + rc = lnet_parse_networks(&nilist, nets); + if (rc != 0) + goto failed; + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + LASSERT(libcfs_isknown_lnd(lnd_type)); + + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); + + ni->ni_lnd = lnd; + + rc = (lnd->lnd_startup)(ni); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } + + LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + + list_del(&ni->ni_list); + + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ + lnet_ni_addref_locked(ni, 0); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); + lnet_ni_addref_locked(ni, 0); + } + + lnet_net_unlock(LNET_LOCK_EX); + + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; + } + + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? 
+ "" : "per-peer "); + goto failed; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); + + nicount++; + } + + if (the_lnet.ln_eq_waitni != NULL && nicount > 1) { + lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type; + LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + return 0; + + failed: + lnet_shutdown_lndnis(); + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + return -ENETDOWN; +} + +/** + * Initialize LNet library. + * + * Only userspace program needs to call this function - it's automatically + * called in the kernel at module loading time. Caller has to call LNetFini() + * after a call to LNetInit(), if and only if the latter returned 0. It must + * be called exactly once. + * + * \return 0 on success, and -ve on failures. + */ +int +LNetInit(void) +{ + int rc; + + lnet_assert_wire_constants(); + LASSERT(!the_lnet.ln_init); + + memset(&the_lnet, 0, sizeof(the_lnet)); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -1; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return -1; + } + + the_lnet.ln_refcount = 0; + the_lnet.ln_init = 1; + LNetInvalidateHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} +EXPORT_SYMBOL(LNetInit); + +/** + * Finalize LNet library. + * + * Only userspace program needs to call this function. It can be called + * at most once. + * + * \pre LNetInit() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void +LNetFini(void) +{ + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + lnd_t, lnd_list)); + lnet_destroy_locks(); + + the_lnet.ln_init = 0; +} +EXPORT_SYMBOL(LNetFini); + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. 
+ * + * Userspace program should call this after a successful call to LNetInit(). + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_init); + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + goto out; + } + + lnet_get_tunables(); + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + rc = -ENETDOWN; + goto failed0; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) + goto failed0; + + rc = lnet_startup_lndnis(); + if (rc != 0) + goto failed1; + + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_check_routes(); + if (rc != 0) + goto failed2; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_acceptor_start(); + if (rc != 0) + goto failed2; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + /* NB router checker needs the_lnet.ln_ping_info in + * lnet_router_checker -> lnet_update_ni_status_locked */ + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed3; + + rc = lnet_router_checker_start(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); + goto out; + + failed4: + lnet_ping_target_fini(); + failed3: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); + failed2: + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + failed1: + lnet_unprepare(); + failed0: + LASSERT(rc < 0); + out: + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini(void) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_proc_fini(); + lnet_router_checker_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + lnet_unprepare(); + } + + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and + * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet + * internal ioctl handler. + * + * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it. + * + * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer. + * The data will be printed to system console. Don't use it excessively. + * \param arg A pointer to lnet_process_id_t, process ID of the peer. + * + * \return Always return 0 when called by users directly (i.e., not via ioctl). 
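+ *
+ * A minimal direct-call sketch for IOC_LIBCFS_DEBUG_PEER (editor's addition,
+ * not part of this patch; the peer NID string is hypothetical):
+ *
+ *	lnet_process_id_t peer;
+ *
+ *	peer.nid = libcfs_str2nid("192.168.0.2@tcp");
+ *	peer.pid = LUSTRE_SRV_LNET_PID;
+ *	(void)LNetCtl(IOC_LIBCFS_DEBUG_PEER, &peer);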
+ */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + lnet_process_id_t id = {0}; + lnet_ni_t *ni; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + rc = lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid, data->ioc_priority); + return (rc != 0) ? rc : lnet_check_routes(); + + case IOC_LIBCFS_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + + case IOC_LIBCFS_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags, + &data->ioc_priority); + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + cfs_time_current() - + cfs_time_seconds(get_seconds() - + (time_t)data->ioc_u64[0])); + + case IOC_LIBCFS_PORTALS_COMPATIBILITY: + /* This can be removed once lustre stops calling it */ + return 0; + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_PING: + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + rc = lnet_ping(id, data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + + case IOC_LIBCFS_DEBUG_PEER: { + /* CAVEAT EMPTOR: this one designed for calling directly; not + * via an ioctl */ + id = *((lnet_process_id_t *) arg); + + lnet_debug_peer(id.nid); + + ni = lnet_net2ni(LNET_NIDNET(id.nid)); + if (ni == NULL) { + CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id)); + } else { + if (ni->ni_lnd->lnd_ctl == NULL) { + CDEBUG(D_WARNING, "No ctl for %s\n", + libcfs_id2str(id)); + } else { + (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg); + } + + lnet_ni_decref(ni); + } + return 0; + } + + default: + ni = lnet_net2ni(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +/** + * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that + * all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * lnet_process_id_t ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, lnet_process_id_t *id) +{ + struct lnet_ni *ni; + struct list_head *tmp; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_init); + + /* LNetNI initilization failed? 
*/ + if (the_lnet.ln_refcount == 0) + return rc; + + cpt = lnet_net_lock_current(); + + list_for_each(tmp, &the_lnet.ln_nis) { + if (index-- != 0) + continue; + + ni = list_entry(tmp, lnet_ni_t, ni_list); + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +/** + * Print a string representation of handle \a h into buffer \a str of + * \a len bytes. + */ +void +LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) +{ + snprintf(str, len, "%#llx", h.cookie); +} +EXPORT_SYMBOL(LNetSnprintHandle); + +static int +lnet_create_ping_info(void) +{ + int i; + int n; + int rc; + unsigned int infosz; + lnet_ni_t *ni; + lnet_process_id_t id; + lnet_ping_info_t *pinfo; + + for (n = 0; ; n++) { + rc = LNetGetId(n, &id); + if (rc == -ENOENT) + break; + + LASSERT(rc == 0); + } + + infosz = offsetof(lnet_ping_info_t, pi_ni[n]); + LIBCFS_ALLOC(pinfo, infosz); + if (pinfo == NULL) { + CERROR("Can't allocate ping info[%d]\n", n); + return -ENOMEM; + } + + pinfo->pi_nnis = n; + pinfo->pi_pid = the_lnet.ln_pid; + pinfo->pi_magic = LNET_PROTO_PING_MAGIC; + pinfo->pi_features = LNET_PING_FEAT_NI_STATUS; + + for (i = 0; i < n; i++) { + lnet_ni_status_t *ns = &pinfo->pi_ni[i]; + + rc = LNetGetId(i, &id); + LASSERT(rc == 0); + + ns->ns_nid = id.nid; + ns->ns_status = LNET_NI_STATUS_UP; + + lnet_net_lock(0); + + ni = lnet_nid2ni_locked(id.nid, 0); + LASSERT(ni != NULL); + + lnet_ni_lock(ni); + LASSERT(ni->ni_status == NULL); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + } + + the_lnet.ln_ping_info = pinfo; + return 0; +} + +static void +lnet_destroy_ping_info(void) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + + lnet_net_unlock(0); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis])); + the_lnet.ln_ping_info = NULL; +} + +int +lnet_ping_target_init(void) +{ + lnet_md_t md = { NULL }; + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int infosz; + + rc = lnet_create_ping_info(); + if (rc != 0) + return rc; + + /* We can have a tiny EQ since we only need to see the unlink event on + * teardown, which by definition is the last one! 
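+	 * (Editor's note: this is why LNetEQAlloc() below asks for only two
+	 * slots and no handler; lnet_ping_target_fini() polls, tolerating
+	 * -EOVERFLOW, until it dequeues the event with ev.unlinked set, and
+	 * nothing can be deposited after that event.)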
*/ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + goto failed_0; + } + + memset(&id, 0, sizeof(lnet_process_id_t)); + id.nid = LNET_NID_ANY; + id.pid = LNET_PID_ANY; + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + /* initialize md content */ + infosz = offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis]); + md.start = the_lnet.ln_ping_info; + md.length = infosz; + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; + md.eq_handle = the_lnet.ln_ping_target_eq; + + rc = LNetMDAttach(meh, md, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + + failed_2: + rc2 = LNetMEUnlink(meh); + LASSERT(rc2 == 0); + failed_1: + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc2 == 0); + failed_0: + lnet_destroy_ping_info(); + return rc; +} + +void +lnet_ping_target_fini(void) +{ + lnet_event_t event; + int rc; + int which; + int timeout_ms = 1000; + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(the_lnet.ln_ping_target_md); + /* NB md could be busy; this just starts the unlink */ + + for (;;) { + rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1, + timeout_ms, &event, &which); + + /* I expect overflow... */ + LASSERT(rc >= 0 || rc == -EOVERFLOW); + + if (rc == 0) { + /* timed out: provide a diagnostic */ + CWARN("Still waiting for ping MD to unlink\n"); + timeout_ms *= 2; + continue; + } + + /* Got a valid event */ + if (event.unlinked) + break; + } + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + lnet_destroy_ping_info(); + cfs_restore_sigs(blocked); +} + +int +lnet_ping(lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + lnet_md_t md = { NULL }; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]); + lnet_ping_info_t *info; + lnet_process_id_t tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + /* initialize md content */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... 
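+		 * (Editor's note: the do/while loop that follows keeps calling
+		 * LNetEQPoll() until event.unlinked is set, blocking signals
+		 * while it waits so the unlink is guaranteed to complete.)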
*/ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; + } + + nob = rc; + LASSERT(nob >= 0 && nob <= infosz); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), info->pi_features); + goto out_1; + } + + if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[0])); + goto out_1; + } + + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; + + if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... */ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; + } + rc = info->pi_nnis; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT(rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/config.c b/kernel/drivers/staging/lustre/lnet/lnet/config.c new file mode 100644 index 000000000..2dc4c4a1a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/config.c @@ -0,0 +1,1292 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +struct lnet_text_buf_t { /* tmp struct for parsing routes */ + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +static void +lnet_syntax(char *name, char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep(char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +static int +lnet_net_unique(__u32 net, struct list_head *nilist) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) + return 0; + } + + return 1; +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +static lnet_ni_t * +lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int rc; + int i; + + if (!lnet_net_unique(net, nilist)) { + LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n", + libcfs_net2str(net)); + return NULL; + } + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net)); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + if (el == NULL) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set 
CPTs for NI %s: %d\n", + libcfs_net2str(net), rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net, 0); + ni->ni_last_alive = get_seconds(); + list_add_tail(&ni->ni_list, nilist); + return ni; + failed: + lnet_ni_free(ni); + return NULL; +} + +int +lnet_parse_networks(struct list_head *nilist, char *networks) +{ + struct cfs_expr_list *el = NULL; + int tokensize = strlen(networks) + 1; + char *tokens; + char *str; + char *tmp; + struct lnet_ni *ni; + __u32 net; + int nnets = 0; + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, + "Can't parse networks: string too long\n"); + return -EINVAL; + } + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + the_lnet.ln_network_tokens = tokens; + the_lnet.ln_network_tokens_nob = tokensize; + memcpy(tokens, networks, tokensize); + str = tmp = tokens; + + /* Add in the loopback network */ + ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist); + if (ni == NULL) + goto failed; + + while (str != NULL && *str != 0) { + char *comma = strchr(str, ','); + char *bracket = strchr(str, '('); + char *square = strchr(str, '['); + char *iface; + int niface; + int rc; + + /* NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) */ + + if (square != NULL && (comma == NULL || square < comma)) { + /* i.e: o2ib0(ib0)[1,2], number between square + * brackets are CPTs this NI needs to be bond */ + if (bracket != NULL && bracket > square) { + tmp = square; + goto failed_syntax; + } + + tmp = strchr(square, ']'); + if (tmp == NULL) { + tmp = square; + goto failed_syntax; + } + + rc = cfs_expr_list_parse(square, tmp - square + 1, + 0, LNET_CPT_NUMBER - 1, &el); + if (rc != 0) { + tmp = square; + goto failed_syntax; + } + + while (square <= tmp) + *square++ = ' '; + } + + if (bracket == NULL || + (comma != NULL && comma < bracket)) { + + /* no interface list specified */ + + if (comma != NULL) + *comma++ = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + + if (net == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, + "Unrecognised network type\n"); + tmp = str; + goto failed_syntax; + } + + if (LNET_NETTYP(net) != LOLND && /* LO is implicit */ + lnet_ni_alloc(net, el, nilist) == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + str = comma; + continue; + } + + *bracket = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + tmp = str; + goto failed_syntax; + } + + nnets++; + ni = lnet_ni_alloc(net, el, nilist); + if (ni == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + niface = 0; + iface = bracket + 1; + + bracket = strchr(iface, ')'); + if (bracket == NULL) { + tmp = iface; + goto failed_syntax; + } + + *bracket = 0; + do { + comma = strchr(iface, ','); + if (comma != NULL) + *comma++ = 0; + + iface = cfs_trimwhite(iface); + if (*iface == 0) { + tmp = iface; + goto failed_syntax; + } + + if (niface == LNET_MAX_INTERFACES) { + LCONSOLE_ERROR_MSG(0x115, + "Too many interfaces for net %s\n", + libcfs_net2str(net)); + goto failed; + } + + ni->ni_interfaces[niface++] = iface; + iface = comma; + } while (iface != NULL); + + str = bracket + 1; + comma 
= strchr(bracket + 1, ','); + if (comma != NULL) { + *comma = 0; + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + str = comma + 1; + continue; + } + + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + } + + LASSERT(!list_empty(nilist)); + return 0; + + failed_syntax: + lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); + failed: + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + if (el != NULL) + cfs_expr_list_free(el); + + LIBCFS_FREE(tokens, tokensize); + the_lnet.ln_network_tokens = NULL; + + return -EINVAL; +} + +static struct lnet_text_buf_t * +lnet_new_text_buf(int str_len) +{ + struct lnet_text_buf_t *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(struct lnet_text_buf_t, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." */ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +static void +lnet_free_text_buf(struct lnet_text_buf_t *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +static void +lnet_free_text_bufs(struct list_head *tbs) +{ + struct lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +static int +lnet_str2tbs_sep(struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + struct lnet_text_buf_t *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (isspace(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -1; + } + + for (i = 0; i < nob; i++) + if (isspace(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +static int +lnet_expand1tb(struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + struct lnet_text_buf_t *ltb; + + LASSERT(*sep1 == '['); + LASSERT(*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +static int +lnet_str2tbs_expand(struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + 
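+	/*
+	 * (Editor's note) Expand one bracketed expression in 'str': each
+	 * comma-separated item between '[' and ']' is either a plain string
+	 * or a numeric range "lo-hi" with an optional "/stride"; e.g. a
+	 * hypothetical "tcp[0-3]" yields tcp0, tcp1, tcp2 and tcp3.  Returns
+	 * 1 if expansions were queued on 'tbs', 0 if there was nothing to
+	 * expand, and -1 on a syntax error.
+	 */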
INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, + &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb( + &pending, str, sep, sep2, + parsed, + (int)(enditem - parsed)) != 0) { + goto failed; + } + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if (enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -1; +} + +static int +lnet_parse_hops(char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + +#define LNET_PRIORITY_SEPARATOR (':') + +static int +lnet_parse_priority(char *str, unsigned int *priority, char **token) +{ + int nob; + char *sep; + int len; + + sep = strchr(str, LNET_PRIORITY_SEPARATOR); + if (sep == NULL) { + *priority = 0; + return 0; + } + len = strlen(sep + 1); + + if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) { + /* Update the caller's token pointer so it treats the found + priority as the token to report in the error message. */ + *token += sep - str + 1; + return -1; + } + + CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); + + /* + * Change priority separator to \0 to be able to parse NID + */ + *sep = '\0'; + return 0; +} + +static int +lnet_parse_route(char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf_t *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + unsigned int hops; + int got_hops = 0; + unsigned int priority = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd) - 1); + cmd[sizeof(cmd) - 1] = 0; + + sep = str; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 
3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, struct lnet_text_buf_t, + ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! */ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + rc = lnet_parse_priority(ltb->ltb_text, + &priority, &token); + if (rc < 0) + goto token_error; + + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + if (!got_hops) + hops = 1; + + LASSERT(!list_empty(&nets)); + LASSERT(!list_empty(&gateways)); + + list_for_each(tmp1, &nets) { + ltb = list_entry(tmp1, struct lnet_text_buf_t, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT(net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(tmp2, &gateways) { + ltb = list_entry(tmp2, struct lnet_text_buf_t, + ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT(nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route(net, hops, nid, priority); + if (rc != 0) { + CERROR("Can't create route to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + + token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); + out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +static int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + struct lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes(char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT(lnet_tbnob == 0); + return rc; +} + +static int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + LIST_HEAD(list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_ip_addr_free(&list); + + return rc; +} + +static int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT(strlen(net_entry) < 
sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +static __u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf_t *tb; + struct lnet_text_buf_t *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf_t, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? 
NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf_t, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + tb2 = lnet_new_text_buf(strlen(sep)); + if (tb2 == NULL) + return -ENOMEM; + + strcpy(tb2->ltb_text, sep); + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks(char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf_t *tb; + struct lnet_text_buf_t *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf_t, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)-1); + source[sizeof(source)-1] = 0; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each(t, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf_t, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT(net1 != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, struct lnet_text_buf_t, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf_t, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? 
"" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +static void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +static int +lnet_ipaddr_enumerate(__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = libcfs_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + libcfs_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = libcfs_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + libcfs_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets(char **networksp, char *ip2nets) +{ + __u32 *ipaddrs = NULL; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, + "ip2nets does not match any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} + +int +lnet_set_ip_niaddr(lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT(LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + 
for (i = 0; i < n; i++) { + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return -ENOENT; +} +EXPORT_SYMBOL(lnet_set_ip_niaddr); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c new file mode 100644 index 000000000..5470148f5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -0,0 +1,441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. 
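+ *
+ * A minimal polling-mode sketch (editor's addition, not part of this patch;
+ * error handling is omitted):
+ *
+ *	lnet_handle_eq_t eqh;
+ *	int rc;
+ *
+ *	rc = LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh);
+ *	if (rc == 0) {
+ *		... bind MDs with md.eq_handle = eqh, consume events with
+ *		LNetEQPoll() or LNetEQWait(), then release with LNetEQFree() ...
+ *	}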
+ * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. + */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + lnet_handle_eq_t *handle) +{ + lnet_eq_t *eq; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + count = cfs_power2_roundup(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) + CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. 
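+ *
+ * (Editor's note) A typical teardown first unlinks every MD bound to the EQ,
+ * waits for the corresponding unlink events, and only then calls
+ * LNetEQFree(); lnet_ping_target_fini() in this file follows that pattern.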
+ */ +int +LNetEQFree(lnet_handle_eq_t eqh) +{ + struct lnet_eq *eq; + lnet_event_t *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free_locked(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(lnet_event_t)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +static int +lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + lnet_event_t *new_event = &eq->eq_events[new_index]; + int rc; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + return 0; + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + return rc; +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. 
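+ *
+ * A minimal sketch (editor's addition) that drains pending events, assuming
+ * 'eqh' came from a successful LNetEQAlloc() and 'handle_event' is a
+ * hypothetical consumer:
+ *
+ *	lnet_event_t ev;
+ *	int rc;
+ *
+ *	while ((rc = LNetEQGet(eqh, &ev)) == 1 || rc == -EOVERFLOW)
+ *		handle_event(&ev);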
+ */ +int +LNetEQGet(lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait(lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + + +static int +lnet_eq_wait_locked(int *timeout_ms) +__must_hold(&the_lnet.ln_eq_wait_lock) +{ + int tms = *timeout_ms; + int wait; + wait_queue_t wl; + unsigned long now; + + if (tms == 0) + return -1; /* don't want to wait and no new event */ + + init_waitqueue_entry(&wl, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + if (tms < 0) { + schedule(); + + } else { + struct timeval tv; + + now = cfs_time_current(); + schedule_timeout(cfs_time_seconds(tms) / 1000); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); + tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + if (tms < 0) /* no more wait but may have new event */ + tms = 0; + } + + wait = tms != 0; /* might need to call here again */ + *timeout_ms = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + + + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout_ms Time in milliseconds to wait for an event to occur on + * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. 
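+ *
+ * A minimal sketch (editor's addition), assuming 'eqh' came from a successful
+ * LNetEQAlloc():
+ *
+ *	lnet_event_t ev;
+ *	int which;
+ *	int rc;
+ *
+ *	rc = LNetEQPoll(&eqh, 1, 1000, &ev, &which);
+ *	if (rc == 1 || rc == -EOVERFLOW)
+ *		... 'ev' holds the next event; 'which' is 0 here ...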
+ */ +int +LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms, + lnet_event_t *event, int *which) +{ + int wait = 1; + int rc; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (neq < 1) + return -ENOENT; + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + return -ENOENT; + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + return rc; + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout_ms); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c new file mode 100644 index 000000000..89d660fef --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c @@ -0,0 +1,454 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(lnet_libmd_t *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... 
*/ + lnet_me_t *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), + * and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free_locked(md); +} + +static int +lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof(lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + /* invalid length */ + if (lmd->md_iov.iov[i].iov_len <= 0) + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* use max size */ + (umd->max_size < 0 || + umd->max_size > total_length)) /* illegal max_size */ + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof(lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) /* illegal max_size */ + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) /* illegal max_size */ + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... 
*/ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & + (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +static int +lnet_md_validate(lnet_md_t *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. 
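+ *
+ * A minimal usage sketch, assuming \a meh came from LNetMEAttach(),
+ * \a eqh from LNetEQAlloc(), and \a buf / \a buflen describe a
+ * caller-owned buffer that should accept incoming PUTs:
+ *
+ *	lnet_md_t umd;
+ *	lnet_handle_md_t mdh;
+ *	int rc;
+ *
+ *	memset(&umd, 0, sizeof(umd));
+ *	umd.start = buf;
+ *	umd.length = buflen;
+ *	umd.threshold = LNET_MD_THRESH_INF;
+ *	umd.options = LNET_MD_OP_PUT;
+ *	umd.eq_handle = eqh;
+ *	rc = LNetMDAttach(meh, umd, LNET_UNLINK, &mdh);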
+ */ +int +LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, + lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + LIST_HEAD(matches); + LIST_HEAD(drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + if (rc != 0) + goto failed; + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto failed; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + lnet_libmd_t *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + + cpt = lnet_res_lock_current(); + if (rc != 0) + goto failed; + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto failed; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. 
+ * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink(lnet_handle_md_t mdh) +{ + lnet_event_t ev; + lnet_libmd_t *md; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c new file mode 100644 index 000000000..a3f929244 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c @@ -0,0 +1,298 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. 
+ * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the lnet_process_id_t + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. 
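+ *
+ * A minimal usage sketch, assuming \a meh0 was returned by an earlier
+ * LNetMEAttach() call on a non-unique portal; the new wildcard ME is
+ * inserted immediately after the existing one:
+ *
+ *	lnet_process_id_t any = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY };
+ *	lnet_handle_me_t meh1;
+ *	int rc;
+ *
+ *	rc = LNetMEInsert(meh0, any, 0, ~0ULL, LNET_UNLINK,
+ *			  LNET_INS_AFTER, &meh1);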
+ */ +int +LNetMEInsert(lnet_handle_me_t current_meh, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free_locked(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free_locked(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. 
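+ *
+ * A minimal teardown sketch, assuming \a meh is a valid ME handle; if
+ * an MD is still attached it is aborted and its unlink event is
+ * delivered as described for LNetMDUnlink():
+ *
+ *	rc = LNetMEUnlink(meh);
+ *	if (rc == -ENOENT)
+ *		CDEBUG(D_NET, "ME handle already invalid\n");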
+ */ +int +LNetMEUnlink(lnet_handle_me_t meh) +{ + lnet_me_t *me; + lnet_libmd_t *md; + lnet_event_t ev; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL) { + md->md_flags |= LNET_MD_FLAG_ABORTED; + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(lnet_me_t *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + lnet_libmd_t *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free_locked(me); +} + +#if 0 +static void +lib_me_dump(lnet_me_t *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lnet_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lnet_me_t, me_list)); +} +#endif diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c new file mode 100644 index 000000000..c2fb70e5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -0,0 +1,2460 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +int +lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + LASSERT(the_lnet.ln_init); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) { /* matched this one */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer(lnet_nid_t nid, int outgoing) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD(&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. 
*/ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return nob; +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = min(this_nob, nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. 
+ * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return 0; /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return niov; + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return nob; +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = min(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy(daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = min(iov->iov_len - iovoffset, + (__kernel_size_t) kiov->kiov_len - kiovoffset); + this_nob = min(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int niov, + struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = min((__kernel_size_t) kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = min(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy(addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + 
unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return 0; /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT(dst->kiov_offset + dst->kiov_len + <= PAGE_CACHE_SIZE); + return niov; + } + + dst->kiov_len = frag_len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +static void +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT(niov > 0); + LASSERT((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +static void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +static void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +static int +lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = 
(ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ +static void +lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + unsigned long last_alive = 0; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_lnd->lnd_query != NULL); + + lnet_net_unlock(lp->lp_cpt); + (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); + lnet_net_lock(lp->lp_cpt); + + lp->lp_last_query = cfs_time_current(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lp_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive(lnet_peer_t *lp, unsigned long now) +{ + int alive; + unsigned long deadline; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + + /* Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + if (!lp->lp_alive && lp->lp_alive_count > 0 && + cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) + return 0; + + deadline = cfs_time_add(lp->lp_last_alive, + cfs_time_seconds(lp->lp_ni->ni_peertimeout)); + alive = cfs_time_after(deadline, now); + + /* Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lp_last_alive at peer creation is assumed. + */ + if (alive && !lp->lp_alive && + !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked(lnet_peer_t *lp) +{ + unsigned long now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). */ + if (lp->lp_last_query != 0) { + static const int lnet_queryinterval = 1; + + unsigned long next_query = + cfs_time_add(lp->lp_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (time_before(now, next_query)) { + if (lp->lp_alive) + CWARN("Unexpected aliveness of peer %s: %d < %d (%d/%d)\n", + libcfs_nid2str(lp->lp_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lp_ni->ni_peertimeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(lp->lp_ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); + return 0; +} + +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval 0 If \a msg sent or OK to send. + * \retval EAGAIN If \a msg blocked for credit. + * \retval EHOSTUNREACH If the next hop of the message appears dead. + * \retval ECANCELED If the MD of the message has been unlinked. 
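+ *
+ * Caller-side sketch of how the positive return codes are consumed
+ * (this mirrors lnet_send() later in this file); EAGAIN means the
+ * message was queued waiting for credits and needs no further action:
+ *
+ *	rc = lnet_post_send_locked(msg, 0);
+ *	if (rc == EHOSTUNREACH || rc == ECANCELED)
+ *		return -rc;
+ *	if (rc == 0)
+ *		lnet_ni_send(src_ni, msg);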
+ */ +static int +lnet_post_send_locked(lnet_msg_t *msg, int do_send) +{ + lnet_peer_t *lp = msg->msg_txpeer; + lnet_ni_t *ni = lp->lp_ni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_net_unlock(cpt); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already called on the MD/ME.\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -ECANCELED); + + lnet_net_lock(cpt); + return ECANCELED; + } + + if (!msg->msg_peertxcredit) { + LASSERT((lp->lp_txcredits < 0) == + !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_txq); + return EAGAIN; + } + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return EAGAIN; + } + } + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return 0; +} + + +static lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(lnet_msg_t *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. 
I + * return EAGAIN if msg blocked and 0 if received or OK to receive */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_routing); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + LASSERT((lp->lp_rtrcredits < 0) == + !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return 0; +} + +void +lnet_return_tx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_msg_t *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = txpeer->lp_ni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == + !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT(txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } +} + +void +lnet_return_rx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, 
lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT(rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == + !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +} + +static int +lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) +{ + lnet_peer_t *p1 = r1->lr_gateway; + lnet_peer_t *p2 = r2->lr_gateway; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1->lr_hops < r2->lr_hops) + return 1; + + if (r1->lr_hops > r2->lr_hops) + return -1; + + if (p1->lp_txqnob < p2->lp_txqnob) + return 1; + + if (p1->lp_txqnob > p2->lp_txqnob) + return -1; + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return -1; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static lnet_peer_t * +lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) +{ + lnet_remotenet_t *rnet; + lnet_route_t *rtr; + lnet_route_t *rtr_best; + lnet_route_t *rtr_last; + struct lnet_peer *lp_best; + struct lnet_peer *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_net_locked(LNET_NIDNET(target)); + if (rnet == NULL) + return NULL; + + lp_best = NULL; + rtr_best = rtr_last = NULL; + list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) { + lp = rtr->lr_gateway; + + if (!lp->lp_alive || /* gateway is down */ + ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 && + rtr->lr_downis != 0)) /* NI to target is down */ + continue; + + if (ni != NULL && lp->lp_ni != ni) + continue; + + if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lp_best == NULL) { + rtr_best = rtr_last = rtr; + lp_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (rtr_last->lr_seq - rtr->lr_seq < 0) + rtr_last = rtr; + + rc = lnet_compare_routes(rtr, rtr_best); + if (rc < 0) + continue; + + rtr_best = rtr; + lp_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (rtr_best != NULL) + rtr_best->lr_seq = rtr_last->lr_seq + 1; + return lp_best; +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + struct lnet_ni *src_ni; + struct lnet_ni *local_ni; + struct lnet_peer *lp; + int cpt; + int cpt2; + int rc; + + /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future */ + /* NB: ni != NULL == 
interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); + again: + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + if (src_nid == LNET_NID_ANY) { + src_ni = NULL; + } else { + src_ni = lnet_nid2ni_locked(src_nid, cpt); + if (src_ni == NULL) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a local nid\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + LASSERT(!msg->msg_routing); + } + + /* Is this for someone on a local network? */ + local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); + + if (local_ni != NULL) { + if (src_ni == NULL) { + src_ni = local_ni; + src_nid = src_ni->ni_nid; + } else if (src_ni == local_ni) { + lnet_ni_decref_locked(local_ni, cpt); + } else { + lnet_ni_decref_locked(local_ni, cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to %s via from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_net_unlock(cpt); + lnet_ni_send(src_ni, msg); + + lnet_net_lock(cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + return 0; + } + + rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); + /* lp has ref on src_ni; lose mine */ + lnet_ni_decref_locked(src_ni, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + LASSERT(lp->lp_ni == src_ni); + } else { + /* sending to a remote network */ + lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); + if (lp == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + LCONSOLE_WARN("No route to %s via %s (all routers down)\n", + libcfs_id2str(msg->msg_target), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, + * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't + * pre-determined router, this can happen if router table + * was changed when we release the lock */ + if (rtr_nid != lp->lp_nid) { + cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); + if (cpt2 != cpt) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + rtr_nid = lp->lp_nid; + cpt = cpt2; + goto again; + } + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + if (src_ni == NULL) { + src_ni = lp->lp_ni; + src_nid = src_ni->ni_nid; + } else { + LASSERT(src_ni == lp->lp_ni); + lnet_ni_decref_locked(src_ni, cpt); + } + + lnet_peer_addref_locked(lp); + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) { + /* I'm the source and now I know which NI to send on */ + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_LNET_PID; + } + + /* 'lp' is our best choice of peer */ + + 
LASSERT(!msg->msg_peertxcredit); + LASSERT(!msg->msg_txcredit); + LASSERT(msg->msg_txpeer == NULL); + + msg->msg_txpeer = lp; /* msg takes my ref on lp */ + + rc = lnet_post_send_locked(msg, 0); + lnet_net_unlock(cpt); + + if (rc == EHOSTUNREACH || rc == ECANCELED) + return -rc; + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; /* rc == 0 or EAGAIN */ +} + +static void +lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) +{ + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (msg->msg_rx_delayed) /* attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) + goto again; + /* fall through */ + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return ENOENT; /* +ve: OK but no match */ + } +} + +static int +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) +{ + struct lnet_match_info info; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_handle_wire_t reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, 
info.mi_rlength); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(ni, msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int rlength; + int mlength; + int cpt; + + cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = min_t(uint, rlength, md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d for MD %#llx would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + 
libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve! */ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +static int +lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc = 0; + + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +char * +lnet_msgtyp2str(int type) +{ + switch (type) { + case LNET_MSG_ACK: + return "ACK"; + case LNET_MSG_PUT: + return "PUT"; + case LNET_MSG_GET: + return "GET"; + case LNET_MSG_REPLY: + return "REPLY"; + case LNET_MSG_HELLO: + return "HELLO"; + default: + return ""; + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +void +lnet_print_hdr(lnet_hdr_t *hdr) +{ + lnet_process_id_t src = {0}; + lnet_process_id_t dst = {0}; + char *type_str = lnet_msgtyp2str(hdr->type); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + dst.nid = hdr->dest_nid; + dst.pid = hdr->dest_pid; + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md %#llx.%#llx, match bits %llu\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data %#llx\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md %#llx.%#llx, match bits %llu\n", + hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md %#llx.%#llx, manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md %#llx.%#llx, length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; + + LASSERT(!in_interrupt()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + 
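+ /* The wire header is little-endian; the fields above have just been
+  * converted to host order.  The switch below enforces the protocol:
+  * ACK and GET carry no payload, while PUT and REPLY payloads are
+  * capped at LNET_MAX_PAYLOAD when we are the final destination and
+  * at LNET_MTU when the message still has to be routed. */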
cpt = lnet_cpt_of_nid(from_nid); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d (%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != get_seconds()) { + lnet_ni_lock(ni); + + /* NB: so far here is the only place to set NI status to "up */ + ni->ni_last_alive = get_seconds(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s (should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s (it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s (final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s (routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(src_nid, 0)) { /* shall we now? */ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; + * i.e. 
flags all clear, pointers NULL etc + */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + goto drop; + } + + if (lnet_isrouter(msg->msg_rxpeer)) { + lnet_peer_set_alive(msg->msg_rxpeer); + if (avoid_asym_router_failure && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + /* received a remote message from router, update + * remote NI status on this router. + * NB: multi-hop routed message will be ignored. + */ + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } + } + + lnet_msg_commit(msg, cpt); + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + if (rc == 0) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + switch (type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, rdma_req); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: + LASSERT(0); + rc = -EPROTO; + goto free_drop; /* prevent an unused label if !kernel */ + } + + if (rc == 0) + return 0; + + LASSERT(rc == ENOENT); + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(ni, msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + lnet_process_id_t id = {0}; + lnet_msg_t *msg; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxpeer->lp_ni, + msg->msg_rxpeer->lp_cpt, + msg->msg_private, msg->msg_len); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. 
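+ * (The drop itself has already been charged to the per-CPT drop
+ * counters by lnet_drop_message() above.)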
+ */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + lnet_msg_t *msg; + lnet_process_id_t id; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); + } +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see lnet_event_t::hdr_data and lnet_event_kind_t. 
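+ *
+ * A minimal illustrative call (not part of this patch; \a mdh and
+ * \a target are assumed to have been prepared already, e.g. the MD
+ * bound with LNetMDBind()):
+ *
+ *	rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, target,
+ *		     portal, match_bits, 0, 0);
+ *	if (rc != 0)
+ *		CERROR("LNetPut to %s failed: %d\n",
+ *		       libcfs_id2str(target), rc);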
+ */ +int +LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +lnet_msg_t * +lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *getmsg) +{ + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. 
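+ *
+ * Illustrative LND-side usage (a sketch only; names are placeholders):
+ *
+ *	reply = lnet_create_reply_msg(ni, getmsg);
+ *	... RDMA the sink data straight into the GET MD ...
+ *	lnet_set_reply_msg_len(ni, reply, nob);
+ *	lnet_finalize(ni, getmsg, status);
+ *	lnet_finalize(ni, reply, status);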
+ * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first */ + + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + lnet_process_id_t peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" + * (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. 
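+ *
+ * A minimal illustrative call (not part of this patch; \a mdh is
+ * assumed to describe the sink buffer, bound with LNetMDBind()):
+ *
+ *	rc = LNetGet(LNET_NID_ANY, mdh, target, portal, match_bits, 0);
+ *	if (rc != 0)
+ *		CERROR("LNetGet from %s failed: %d\n",
+ *		       libcfs_id2str(target), rc);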
+ */ +int +LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. + */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni; + lnet_remotenet_t *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each(e, &the_lnet.ln_nis) { + ni = list_entry(e, lnet_ni_t, ni_list); + + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 
0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == dstnet) { + lnet_route_t *route; + lnet_route_t *shortest = NULL; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + if (shortest == NULL || + route->lr_hops < shortest->lr_hops) + shortest = route; + } + + LASSERT(shortest != NULL); + hops = shortest->lr_hops; + if (srcnidp != NULL) + *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); + +/** + * Set the number of asynchronous messages expected from a target process. + * + * This function is only meaningful for userspace callers. It's a no-op when + * called from kernel. + * + * Asynchronous messages are those that can come from a target when the + * userspace process is not waiting for IO to complete; e.g., AST callbacks + * from Lustre servers. Specifying the expected number of such messages + * allows them to be eagerly received when user process is not running in + * LNet; otherwise network errors may occur. + * + * \param id Process ID of the target process. + * \param nasync Number of asynchronous messages expected from the target. + * + * \return 0 on success, and an error code otherwise. + */ +int +LNetSetAsync(lnet_process_id_t id, int nasync) +{ + return 0; +} +EXPORT_SYMBOL(LNetSetAsync); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c new file mode 100644 index 000000000..a46ccbf66 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c @@ -0,0 +1,647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +void +lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev) +{ + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + ev->initiator.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(lnet_msg_t *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + lnet_counters_t *counters = the_lnet.ln_counters[cpt]; + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add(&msg->msg_activelist, &container->msc_active); + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; +} + +static void +lnet_msg_decommit_tx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + 
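+ /* Undo the send-side commit: on success, account the message in the
+  * per-CPT counters keyed by event type (restoring msg_type where it
+  * was overwritten while sending an ACK or REPLY); in all cases return
+  * the tx credits and clear msg_tx_committed. */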
LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + counters->route_length += msg->msg_len; + counters->route_count++; + goto out; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + counters->send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + counters->send_count++; + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto out; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + counters->send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + counters->recv_count++; + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + counters->recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. 
Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* committed for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +void +lnet_msg_detach_md(lnet_msg_t *msg, int status) +{ + lnet_libmd_t *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static int +lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) +{ + lnet_handle_wire_t ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT(msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. 
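+ * lnet_finalize() reacts to the non-zero return by looping back to
+ * its "again" label and re-finalizing this message on the right CPT.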
+ */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free_locked(msg); + return 0; +} + +void +lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; + + LASSERT(!in_interrupt()); + + if (msg == NULL) + return; +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? "" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + msg->msg_ev.status = status; + + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, status); + lnet_res_unlock(cpt); + } + + again: + rc = 0; + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be committed for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); + + /* Recursion breaker. 
Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } + + container->msc_finalizers[my_slot] = current; + + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + lnet_msg_t, msg_list); + + list_del(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + lnet_msg_t *msg = list_entry(container->msc_active.next, + lnet_msg_t, msg_activelist); + + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&container->msc_freelist); +#endif + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + +#ifdef LNET_USE_LIB_FREELIST + memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t)); + + rc = lnet_freelist_init(&container->msc_freelist, + LNET_FL_MAX_MSGS, sizeof(lnet_msg_t)); + if (rc != 0) { + CERROR("Failed to init freelist for message container\n"); + lnet_msg_container_cleanup(container); + return rc; + } +#else + rc = 0; +#endif + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + 
return rc; + } + } + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c new file mode 100644 index 000000000..3ba0da919 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c @@ -0,0 +1,935 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if 
(LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(lnet_libmd_t *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + lnet_me_t *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? 
+ ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, lnet_process_id_t id, + __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid); + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? 
*/ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + lnet_me_t *me; + lnet_me_t *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* steal buffer from other CPTs, and delay it if nothing to steal, + * this function is more expensive than a regular match, but we + * don't expect it can happen a lot */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { /* the first try, attach on stealing list */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { /* on stealing list */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) + list_del_init(&msg->msg_list); + + } else { + /* could be matched by lnet_ptl_attach_md() + * which is called by another thread */ + rc = msg->msg_md == NULL ? 
+ LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + if (!list_empty(&msg->msg_list) && /* not matched yet */ + (i == LNET_CPT_NUMBER - 1 || /* the last CPT */ + ptl->ptl_mt_nmaps == 0 || /* no active CPT */ + (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */ + ptl->ptl_mt_maps[0] == cpt))) { + /* nothing to steal, delay or drop */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d MB=%#llx\n", + libcfs_id2str(info->mi_id), info->mi_rlength, info->mi_portal, + info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_shutdown) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + if (msg->msg_rx_delayed) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + lnet_msg_t *tmp; + lnet_msg_t *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + lnet_hdr_t *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + lnet_me_t *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + lnet_me_t, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return 
-ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * This prevents dropped requests, but it should be regarded as the last + * line of defense - i.e. users must keep a close watch on the number of + * active buffers on a lazy portal and post more buffers as soon as that + * number becomes too low. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will all be dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + struct lnet_portal *ptl; + LIST_HEAD (zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (the_lnet.ln_shutdown) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr"); + + return 0; +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lo.c b/kernel/drivers/staging/lustre/lnet/lnet/lo.c new file mode 100644 index 000000000..f708c2e64 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lo.c @@ -0,0 +1,120 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +static int +lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + lnet_msg_t *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(ni, lntmsg, 0); + } + + lnet_finalize(ni, sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(lnet_ni_t *ni) +{ + CDEBUG(D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(lnet_ni_t *ni) +{ + LASSERT(ni->ni_lnd == &the_lolnd); + LASSERT(!lolnd_instanced); + lolnd_instanced = 1; + + return 0; +} + +lnd_t the_lolnd = { + /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, + /* .lnd_refcount = */ 0, + /* .lnd_type = */ LOLND, + /* .lnd_startup = */ lolnd_startup, + /* .lnd_shutdown = */ lolnd_shutdown, + /* .lnt_ctl = */ NULL, + /* .lnd_send = */ lolnd_send, + /* .lnd_recv = */ lolnd_recv, + /* .lnd_eager_recv = */ NULL, + /* .lnd_notify = */ NULL, + /* .lnd_accept = */ NULL +}; diff --git a/kernel/drivers/staging/lustre/lnet/lnet/module.c b/kernel/drivers/staging/lustre/lnet/lnet/module.c new file mode 100644 index 000000000..72b7fbc83 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/module.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +static int config_on_load; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static struct mutex lnet_config_mutex; + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } + } + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure(void) +{ + int refcount; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + } + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return (refcount == 0) ? 0 : -EBUSY; +} + +static int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: + return lnet_configure(NULL); + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, data); + LNetNIFini(); + } + return rc; + } +} + +static DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); + +static int __init +init_lnet(void) +{ + int rc; + + mutex_init(&lnet_config_mutex); + + rc = LNetInit(); + if (rc != 0) { + CERROR("LNetInit: error %d\n", rc); + return rc; + } + + rc = libcfs_register_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void) kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + return 0; +} + +static void __exit +fini_lnet(void) +{ + int rc; + + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + LNetFini(); +} + +MODULE_AUTHOR("Peter J. 
Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); + +module_init(init_lnet); +module_exit(fini_lnet); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/peer.c b/kernel/drivers/staging/lustre/lnet/lnet/peer.c new file mode 100644 index 000000000..45b5742f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/peer.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + INIT_LIST_HEAD(&ptable->pt_deathrow); + + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (the_lnet.ln_peer_tables == NULL) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (hash == NULL) /* not initialized */ + break; + + LASSERT(list_empty(&ptable->pt_deathrow)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +void +lnet_peer_tables_cleanup(void) +{ + struct lnet_peer_table *ptable; + int i; + int j; + + LASSERT(the_lnet.ln_shutdown); /* i.e. 
no new peers */ + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) { + struct list_head *peers = &ptable->pt_hash[j]; + + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, + lp_hashlist); + list_del_init(&lp->lp_hashlist); + /* lose hash table's ref */ + lnet_peer_decref_locked(lp); + } + } + + lnet_net_unlock(i); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIST_HEAD(deathrow); + lnet_peer_t *lp; + + lnet_net_lock(i); + + for (j = 3; ptable->pt_number != 0; j++) { + lnet_net_unlock(i); + + if ((j & (j - 1)) == 0) { + CDEBUG(D_WARNING, + "Waiting for %d peers on peer table\n", + ptable->pt_number); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 2); + lnet_net_lock(i); + } + list_splice_init(&ptable->pt_deathrow, &deathrow); + + lnet_net_unlock(i); + + while (!list_empty(&deathrow)) { + lp = list_entry(deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + LIBCFS_FREE(lp, sizeof(*lp)); + } + } +} + +void +lnet_destroy_peer_locked(lnet_peer_t *lp) +{ + struct lnet_peer_table *ptable; + + LASSERT(lp->lp_refcount == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_txq)); + LASSERT(list_empty(&lp->lp_hashlist)); + LASSERT(lp->lp_txqnob == 0); + + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; + + lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt); + lp->lp_ni = NULL; + + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); +} + +lnet_peer_t * +lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + lnet_peer_t *lp; + + LASSERT(!the_lnet.ln_shutdown); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lp_hashlist) { + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) +{ + struct lnet_peer_table *ptable; + lnet_peer_t *lp = NULL; + lnet_peer_t *lp2; + int cpt2; + int rc = 0; + + *lpp = NULL; + if (the_lnet.ln_shutdown) /* it's shutting down */ + return -ESHUTDOWN; + + /* cpt can be LNET_LOCK_EX if it's called from router functions */ + cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); + + ptable = the_lnet.ln_peer_tables[cpt2]; + lp = lnet_find_peer_locked(ptable, nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + if (!list_empty(&ptable->pt_deathrow)) { + lp = list_entry(ptable->pt_deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + } + + /* + * take extra refcount in case another thread has shutdown LNet + * and destroyed locks and peer-table before I finish the allocation + */ + ptable->pt_number++; + lnet_net_unlock(cpt); + + if (lp != NULL) + memset(lp, 0, sizeof(*lp)); + else + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp)); + + if (lp == NULL) { + rc = -ENOMEM; + lnet_net_lock(cpt); + goto out; + } + + INIT_LIST_HEAD(&lp->lp_txq); + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + + lp->lp_notify = 0; + lp->lp_notifylnd = 0; + lp->lp_notifying = 0; + lp->lp_alive_count = 0; + lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ + lp->lp_last_alive = cfs_time_current(); /* assumes alive */ + lp->lp_last_query = 0; /* haven't asked NI yet */ + lp->lp_ping_timestamp = 0; + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; + lp->lp_nid = nid; + lp->lp_cpt = cpt2; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + lp->lp_rtr_refcount = 0; + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + lp2 = lnet_find_peer_locked(ptable, nid); + if (lp2 != NULL) { + *lpp = lp2; + goto out; + } + + lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2); + if (lp->lp_ni == NULL) { + rc = -EHOSTUNREACH; + goto out; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + lp->lp_rtrcredits = + lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni); + + list_add_tail(&lp->lp_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + *lpp = lp; + + return 0; +out: + if (lp != NULL) + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); + ptable->pt_number--; + return rc; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + lnet_peer_t *lp; + int rc; + int cpt; + + cpt = lnet_cpt_of_nid(nid); + lnet_net_lock(cpt); + + rc = lnet_nid2peer_locked(&lp, nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lp_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + aliveness, lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, + lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router.c b/kernel/drivers/staging/lustre/lnet/lnet/router.c new file mode 100644 index 000000000..8510bae48 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/router.c @@ -0,0 +1,1706 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +#if defined(LNET_ROUTER) + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) + +static char *forwarding = ""; +module_param(forwarding, charp, 0444); +MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +module_param(tiny_router_buffers, int, 0444); +MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +module_param(small_router_buffers, int, 0444); +MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +module_param(large_router_buffers, int, 0444); +MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); + +static int auto_down = 1; +module_param(auto_down, int, 0444); +MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + /* NI option overrides LNet default */ + if (ni->ni_peerrtrcredits > 0) + return ni->ni_peerrtrcredits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return ni->ni_peertxcredits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); +#else + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + return 0; +} + +#endif + +static int check_routers_before_use; +module_param(check_routers_before_use, int, 0444); +MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); + +int avoid_asym_router_failure = 1; +module_param(avoid_asym_router_failure, int, 0644); +MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +module_param(dead_router_check_interval, int, 0644); +MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +module_param(live_router_check_interval, int, 0644); +MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +module_param(router_ping_timeout, int, 0644); +MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, + unsigned long when) +{ + if (time_before(when, lp->lp_timestamp)) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + lp->lp_timestamp = when; /* update timestamp */ + lp->lp_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lp_alive_count != 0 && /* got old news */ + (!lp->lp_alive) == (!alive)) { /* new date for old news */ + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + 
lp->lp_alive_count++; + lp->lp_alive = !(!alive); /* 1 bit! */ + lp->lp_notify = 1; + lp->lp_notifylnd |= notifylnd; + if (lp->lp_alive) + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); +} + +static void +lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + if (lp->lp_notifying || ni == NULL) + return; + + lp->lp_notifying = 1; + + while (lp->lp_notify) { + alive = lp->lp_alive; + notifylnd = lp->lp_notifylnd; + + lp->lp_notifylnd = 0; + lp->lp_notify = 0; + + if (notifylnd && ni->ni_lnd->lnd_notify != NULL) { + lnet_net_unlock(lp->lp_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + + lnet_net_lock(lp->lp_cpt); + } + } + + lp->lp_notifying = 0; +} + + +static void +lnet_rtr_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, + lp_rtr_list); + + if (rtr->lp_nid < lp->lp_nid) + break; + } + + list_add(&lp->lp_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + if (lp->lp_rcd != NULL) { + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +lnet_remotenet_t * +lnet_find_net_locked(__u32 net) +{ + lnet_remotenet_t *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(!the_lnet.ln_shutdown); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + int lnd_type, seed[2]; + struct timeval tv; + lnet_ni_t *ni; + struct list_head *tmp; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + seeded = 1; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also 
prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway, + unsigned int priority) +{ + struct list_head *e; + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_ni_t *ni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %u priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + hops < 1 || hops > 255) + return -EINVAL; + + if (lnet_islocalnet(net)) /* it's a local network */ + return 0; /* ignore the route entry */ + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + route->lr_priority = priority; + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); + if (rc != 0) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + if (rc == -EHOSTUNREACH) /* gateway is not on a local net */ + return 0; /* ignore the route entry */ + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + + return rc; + } + + LASSERT(!the_lnet.ln_shutdown); + + rnet2 = lnet_find_net_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(e, &rnet2->lrn_routes) { + lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list); + + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(route2->lr_gateway->lp_nid != gateway); + } + + if (add_route) { + lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = route->lr_gateway->lp_ni; + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_lnd->lnd_notify != NULL) + (ni->ni_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + if (!add_route) + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + return 0; +} + +int +lnet_check_routes(void) +{ + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet 
= list_entry(e1, lnet_remotenet_t, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, lnet_route_t, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lp_ni == + route2->lr_gateway->lp_ni) + continue; + + nid1 = route->lr_gateway->lp_nid; + nid2 = route2->lr_gateway->lp_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer *gateway; + lnet_remotenet_t *rnet; + lnet_route_t *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry (actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + + again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lp_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes(void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority) +{ + struct list_head *e1; + struct list_head *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *priority = route->lr_priority; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(lnet_ping_info_t *info) +{ + int i; + lnet_ni_status_t *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i <
LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(lnet_rc_data_t *rcd) +{ + lnet_ping_info_t *info = rcd->rcd_pinginfo; + struct lnet_peer *gw = rcd->rcd_gateway; + lnet_route_t *rtr; + + if (!gw->lp_alive) + return; + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); + + /* NB always racing with network! */ + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lp_nid), info->pi_magic); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + gw->lp_ping_feats = info->pi_features; + if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); + return; /* nothing I can understand */ + } + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + return; /* can't carry NI status info */ + + list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) { + int ptl_status = LNET_NI_STATUS_INVALID; + int down = 0; + int up = 0; + int i; + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + lnet_ni_status_t *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lp_nid)); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND) + down++; + else if (ptl_status != LNET_NI_STATUS_UP) + ptl_status = LNET_NI_STATUS_DOWN; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rtr->lr_net) { + up = 1; + break; + } + /* ptl NIs are considered down only when + * they're all down */ + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_status = LNET_NI_STATUS_UP; + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lp_nid), stat->ns_status); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rtr->lr_downis = 0; + continue; + } + rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN); + } +} + +static void +lnet_router_checker_event(lnet_event_t *event) +{ + lnet_rc_data_t *rcd = event->md.user_ptr; + struct lnet_peer *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lp_cpt); + if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lp_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). 
*/ + + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! */ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lp_cpt); +} + +static void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } +} + +void +lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net) +{ + lnet_route_t *rte; + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { + if (rte->lr_net == net) { + rte->lr_downis = 0; + break; + } + } + } +} + +static void +lnet_update_ni_status_locked(void) +{ + lnet_ni_t *ni; + long now; + int timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + max(live_router_check_interval, dead_router_check_interval); + + now = get_seconds(); + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + if (ni->ni_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +static void +lnet_destroy_rc_data(lnet_rc_data_t *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lp_cpt; + + lnet_net_lock(cpt); + lnet_peer_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +static lnet_rc_data_t * +lnet_create_rc_data_locked(lnet_peer_t *gateway) +{ + lnet_rc_data_t *rcd = NULL; + lnet_ping_info_t *pi; + int rc; + int i; + + lnet_net_unlock(gateway->lp_cpt); + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) + goto out; + + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + + LASSERT(!LNetHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((lnet_md_t){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out; + } + LASSERT(rc == 0); + + 
lnet_net_lock(gateway->lp_cpt); + /* router table changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) { + lnet_net_unlock(gateway->lp_cpt); + goto out; + } + + lnet_peer_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lp_rcd = rcd; + gateway->lp_ping_notsent = 0; + + return rcd; + + out: + if (rcd != NULL) { + if (!LNetHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } + lnet_destroy_rc_data(rcd); + } + + lnet_net_lock(gateway->lp_cpt); + return gateway->lp_rcd; +} + +static int +lnet_router_check_interval(lnet_peer_t *rtr) +{ + int secs; + + secs = rtr->lp_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked(lnet_peer_t *rtr) +{ + lnet_rc_data_t *rcd = NULL; + unsigned long now = cfs_time_current(); + int secs; + + lnet_peer_addref_locked(rtr); + + if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ + cfs_time_after(now, rtr->lp_ping_deadline)) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + lnet_ni_notify_locked(rtr->lp_ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_decref_locked(rtr); + return; + } + + rcd = rtr->lp_rcd != NULL ? + rtr->lp_rcd : lnet_create_rc_data_locked(rtr); + + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lp_nid), secs, + rtr->lp_ping_deadline, rtr->lp_ping_notsent, + rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); + + if (secs != 0 && !rtr->lp_ping_notsent && + cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + lnet_process_id_t id; + lnet_handle_md_t mdh; + + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lp_ping_notsent = 1; + rtr->lp_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lp_ping_deadline == 0) { + rtr->lp_ping_deadline = + cfs_time_shift(router_ping_timeout); + } + + lnet_net_unlock(rtr->lp_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + lnet_net_lock(rtr->lp_cpt); + if (rc != 0) + rtr->lp_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_decref_locked(rtr); +} + +int +lnet_router_checker_start(void) +{ + int rc; + int eqsz; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n"); + return -EINVAL; + } + + if (!the_lnet.ln_routing && + live_router_check_interval <= 0 && + dead_router_check_interval <= 0) + return 0; + + sema_init(&the_lnet.ln_rc_signal, 0); + /* EQ size doesn't matter; the callback is guaranteed to get every + * event */ + eqsz = 0; + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + return -ENOMEM; + } + + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + rc = PTR_ERR(kthread_run(lnet_router_checker, + NULL, "router_checker")); + if (IS_ERR_VALUE(rc)) { + CERROR("Can't start router checker thread: %d\n", rc); + /* block until 
event callback signals exit */ + down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. */ + lnet_wait_known_routerstate(); + } + + return 0; +} + +void +lnet_router_checker_stop(void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); +} + +static void +lnet_prune_rc_data(int wait_unlink) +{ + lnet_rc_data_t *rcd; + lnet_rc_data_t *tmp; + lnet_peer_t *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lp_rtr_list) { + if (lp->lp_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lp_rcd->rcd_list)); + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + lnet_rc_data_t, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + + +#if defined(LNET_ROUTER) + +static int +lnet_router_checker(void *arg) +{ + lnet_peer_t *rtr; + struct list_head *entry; + + cfs_block_allsigs(); + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ + + /* Call schedule_timeout() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING); + + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; +} + +static void +lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) +{ + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +static lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = alloc_pages_node( + cfs_cpt_spread_node(lnet_cpt_table(), cpt), + __GFP_ZERO | GFP_IOFS, 0); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + LASSERT(list_empty(&rbp->rbp_msgs)); + LASSERT(rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT(rbp->rbp_credits > 0); + + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + nbuffers++; + } + + LASSERT(rbp->rbp_nbuffers == nbuffers); + LASSERT(rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} + +static int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt) +{ + lnet_rtrbuf_t *rb; + int i; + + if (rbp->rbp_nbuffers != 0) { + 
LASSERT(rbp->rbp_nbuffers == nbufs); + return 0; + } + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp, cpt); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + rbp->rbp_credits++; + rbp->rbp_mincredits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* No allocation "under fire" */ + /* Otherwise we'd need code to schedule blocked msgs etc */ + LASSERT(!the_lnet.ln_routing); + } + + LASSERT(rbp->rbp_credits == nbufs); + return 0; +} + +static void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(void) +{ + lnet_rtrbufpool_t *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[0]); + lnet_rtrpool_free_bufs(&rtrp[1]); + lnet_rtrpool_free_bufs(&rtrp[2]); + } + + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; +} + +static int +lnet_nrb_tiny_calculate(int npages) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when routing enabled\n", + tiny_router_buffers); + return -1; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(int npages) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when routing enabled\n", + small_router_buffers); + return -1; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(int npages) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when routing enabled\n", + large_router_buffers); + return -1; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + lnet_rtrbufpool_t *rtrp; + int large_pages; + int small_pages = 1; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(0); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(small_pages); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(large_pages); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(lnet_rtrbufpool_t)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + 
lnet_rtrpool_init(&rtrp[0], 0); + rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[1], small_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[2], large_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; + + failed: + lnet_rtrpools_free(); + return rc; +} + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when) +{ + struct lnet_peer *lp = NULL; + unsigned long now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid); + + LASSERT(!in_interrupt ()); + + CDEBUG(D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (cfs_time_after(when, now)) { + CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. 
For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + lnet_ni_notify_locked(ni, lp); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); + +void +lnet_get_tunables(void) +{ +} + +#else + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when) +{ + return -EOPNOTSUPP; +} + +void +lnet_router_checker(void) +{ + static time_t last; + static int running; + + time_t now = get_seconds(); + int interval = now - last; + int rc; + __u64 version; + lnet_peer_t *rtr; + + /* It's no use to call me again within a sec - all intervals and + * timeouts are measured in seconds */ + if (last != 0 && interval < 2) + return; + + if (last != 0 && + interval > max(live_router_check_interval, + dead_router_check_interval)) + CNETERR("Checker(%d/%d) not called for %d seconds\n", + live_router_check_interval, dead_router_check_interval, + interval); + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + LASSERT(!running); /* recursion check */ + running = 1; + lnet_net_unlock(0); + + last = now; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) + lnet_prune_rc_data(0); /* unlink all rcd and nowait */ + + /* consume all pending events */ + while (1) { + int i; + lnet_event_t ev; + + /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the + * recursion breaker in LNetEQPoll would fail */ + rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i); + if (rc == 0) /* no event pending */ + break; + + /* NB a lost SENT prevents me from pinging a router again */ + if (rc == -EOVERFLOW) { + CERROR("Dropped an event!!!\n"); + abort(); + } + + LASSERT(rc == 1); + + lnet_router_checker_event(&ev); + } + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) { + lnet_prune_rc_data(1); /* release rcd */ + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + running = 0; + return; + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + lnet_net_lock(0); + + version = the_lnet.ln_routers_version; + list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ping_router_locked(rtr); + LASSERT(version == the_lnet.ln_routers_version); + } + + lnet_net_unlock(0); + + running = 0; /* lock only needed for the recursion check */ +} + +/* NB lnet_peers_start_down depends on me, + * so must be called before any peer creation */ +void +lnet_get_tunables(void) +{ + char *s; + + s = getenv("LNET_ROUTER_PING_TIMEOUT"); + if (s != NULL) + router_ping_timeout = atoi(s); + + s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL"); + if (s != NULL) + live_router_check_interval = atoi(s); + + s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL"); + if (s != NULL) + dead_router_check_interval = atoi(s); + + /* This replaces old lnd_notify mechanism */ + check_routers_before_use = 1; + if (dead_router_check_interval <= 0) + dead_router_check_interval = 30; +} + +void +lnet_rtrpools_free(void) +{ +} + +int +lnet_rtrpools_alloc(int im_a_arouter) +{ + return 0; +} + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c new file mode 100644 index 000000000..c055afc86 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c @@ -0,0 +1,968 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" + +#if defined(LNET_ROUTER) + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ + +static struct ctl_table_header *lnet_table_header; + +#define CTL_LNET (0x100) +enum { + PSDEV_LNET_STATS = 100, + PSDEV_LNET_ROUTES, + PSDEV_LNET_ROUTERS, + PSDEV_LNET_PEERS, + PSDEV_LNET_BUFFERS, + PSDEV_LNET_NIS, + PSDEV_LNET_PTL_ROTOR, +}; + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS max_t(size_t, min_t(size_t, LNET_LOFFT_BITS, 64) / 4, 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int proc_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} + +static int __proc_lnet_stats(void *data, int write, + 
loff_t pos, void __user *buffer, int nob) +{ + int rc; + lnet_counters_t *ctrs; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 %llu */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u %llu %llu %llu %llu", + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int proc_lnet_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_stats); +} + +static int proc_lnet_routes(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + lnet_route_t *route = NULL; + lnet_remotenet_t *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, lnet_remotenet_t, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + unsigned int hops = route->lr_hops; + unsigned int priority = route->lr_priority; + lnet_nid_t nid = route->lr_gateway->lp_nid; + int alive = route->lr_gateway->lp_alive; + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4u %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? 
"up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_routers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + lnet_peer_t *lp = list_entry(r, lnet_peer_t, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + unsigned long now = cfs_time_current(); + unsigned long deadline = peer->lp_ping_deadline; + int nrefs = peer->lp_refcount; + int nrtrrefs = peer->lp_rtr_refcount; + int alive_cnt = peer->lp_alive_count; + int alive = peer->lp_alive; + int pingsent = !peer->lp_ping_notsent; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lp_ping_timestamp)); + int down_ni = 0; + lnet_route_t *rtr; + + if ((peer->lp_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lp_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? 
"up" : "down", last_ping, + pingsent, + cfs_duration_sec(cfs_time_sub(deadline, now)), + down_ni, libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_peers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + LASSERT(!write); + + if (*lenp == 0) + return 0; + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer *peer; + struct list_head *p; + int skip; + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lp_hashlist */ + if (lp->lp_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lp_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + int nrefs = peer->lp_refcount; + int lastalive = -1; + char *aliveness = "NA"; + int maxcr = peer->lp_ni->ni_peertxcredits; + int txcr = peer->lp_txcredits; + int mintxcr = peer->lp_mintxcredits; + int rtrcr = peer->lp_rtrcredits; + int minrtrcr = peer->lp_minrtrcredits; + int txqnob = peer->lp_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lp_alive ? 
"up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + unsigned long now = cfs_time_current(); + long delta; + + delta = cfs_time_sub(now, peer->lp_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + lnet_rtrbufpool_t *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int proc_lnet_buffers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_buffers); +} + +static int proc_lnet_nis(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + } else { + struct list_head *n; + lnet_ni_t *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list); 
+ + if (skip == 0) { + ni = a_ni; + break; + } + + skip--; + n = n->next; + } + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + long now = get_seconds(); + int last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_peertxcredits, + ni->ni_peerrtrcredits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +extern int portal_rotor; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int buf_len = 128; + char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + 
LIBCFS_FREE(buf, buf_len); + return rc; +} + +static int proc_lnet_portal_rotor(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_portal_rotor); +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + .procname = "peers", + .mode = 0444, + .proc_handler = &proc_lnet_peers, + }, + { + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + .procname = "nis", + .mode = 0444, + .proc_handler = &proc_lnet_nis, + }, + { + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + } +}; + +static struct ctl_table top_table[] = { + { + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + } +}; + +void +lnet_proc_init(void) +{ + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +} + +void +lnet_proc_fini(void) +{ + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +} + +#else + +void +lnet_proc_init(void) +{ +} + +void +lnet_proc_fini(void) +{ +} + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/Makefile b/kernel/drivers/staging/lustre/lnet/selftest/Makefile new file mode 100644 index 000000000..c0de6e2d9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \ + module.o ping_test.o brw_test.o diff --git a/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c new file mode 100644 index 000000000..658f4584f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c @@ -0,0 +1,508 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +module_param(brw_srv_workitems, int, 0644); +MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); + +static int brw_inject_errors; +module_param(brw_inject_errors, int, 0644); +MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); + +static void +brw_client_fini(sfw_test_instance_t *tsi) +{ + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(tsi->tsi_is_client); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) + continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +static int +brw_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int flags; + int npg; + int len; + int opc; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid), + npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static int +brw_inject_one_error(void) +{ + struct timeval tv; + + if (brw_inject_errors <= 0) + return 0; + + do_gettimeofday(&tv); + + if ((tv.tv_usec & 1) == 0) + return 0; + + return brw_inject_errors--; +} + +static void +brw_fill_page(struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + int i; + + LASSERT(addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) + memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE); + return; + } + + LBUG(); +} + +static int +brw_check_page(struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + 
if (data != magic) + goto bad_data; + + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) { + data = *(((__u64 *) addr) + i); + if (data != magic) + goto bad_data; + } + + return 0; + } + + LBUG(); + +bad_data: + CERROR("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + brw_fill_page(pg, pattern, magic); + } +} + +static int +brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (brw_check_page(pg, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpcpp) +{ + srpc_bulk_t *bulk = tsu->tsu_private; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_client_rpc_t *rpc; + srpc_brw_reqst_t *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + goto out; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? 
D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + goto out; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + goto out; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } + +out: + return; +} + +static void +brw_server_rpc_done(srpc_server_rpc_t *rpc) +{ + srpc_bulk_t *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(srpc_server_rpc_t *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + srpc_brw_reqst_t *reqst; + srpc_msg_t *reqstmsg; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT(sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT(reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_CACHE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + 
reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +srpc_service_t brw_test_service; +void brw_init_test_service(void) +{ + + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conctl.c b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c new file mode 100644 index 000000000..045fe295a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c @@ -0,0 +1,929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lnetst.h" +#include "console.h" + +static int +lst_session_new_ioctl(lstio_session_new_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_force, + args->lstio_ses_timeout, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +static int +lst_session_end_ioctl(lstio_session_end_args_t *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +static int +lst_session_info_ioctl(lstio_session_info_args_t *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_keyp == NULL || /* address for output key */ + args->lstio_ses_featp == NULL || /* address for output features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for output name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +static int +lst_debug_ioctl(lstio_debug_args_t *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + 
LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +static int +lst_group_add_ioctl(lstio_group_add_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_del_ioctl(lstio_group_del_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_update_ioctl(lstio_group_update_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_nodes_add_ioctl(lstio_group_nodes_args_t *args) +{ + unsigned feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + 
copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(lstio_group_list_args_t *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(lstio_group_info_args_t *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(lstio_batch_add_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(lstio_batch_run_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(lstio_batch_stop_args_t *args) +{ + int rc; + char *name; + + if 
(args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(lstio_batch_query_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(lstio_batch_list_args_t *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(lstio_batch_info_args_t *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + 
return rc; +} + +static int +lst_stat_query_ioctl(lstio_stat_args_t *args) +{ + int rc; + char *name; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL || + (args->lstio_sta_namep == NULL && + args->lstio_sta_idsp == NULL) || + args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL && + args->lstio_sta_count <= 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen)) { + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return -EFAULT; + } + + if (args->lstio_sta_idsp == NULL) { + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else { + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } + + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + + return rc; +} + +static int lst_test_add_ioctl(lstio_test_args_t *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen) || + copy_from_user(param, args->lstio_tes_param, + args->lstio_tes_param_len)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? 
-EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + char *buf; + int opc = data->ioc_u32[0]; + int rc; + + if (cmd != IOC_LIBCFS_LNETST) + return -EINVAL; + + if (data->ioc_plen1 > PAGE_CACHE_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) + return -ENOMEM; + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + LIBCFS_FREE(buf, data->ioc_plen1); + return -EFAULT; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = get_seconds(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((lstio_debug_args_t *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((lstio_test_args_t *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf); + break; + default: + rc = -EINVAL; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(lstcon_trans_stat_t))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); + + LIBCFS_FREE(buf, data->ioc_plen1); + + return rc; +} + +EXPORT_SYMBOL(lstcon_ioctl_entry); diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c new file mode 100644 index 000000000..77f02b761 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -0,0 
+1,1396 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, + lstcon_node_t *, lstcon_trans_stat_t *); + +static void +lstcon_rpc_done(srpc_client_rpc_t *rpc) +{ + lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp == 0) { + /* not aborted */ + LASSERT(crpc->crp_status == 0); + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +static int +lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +{ + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_trans = NULL; + crpc->crp_node = nd; + crpc->crp_posted = 0; + crpc->crp_finished = 0; + crpc->crp_unpacked = 0; + crpc->crp_status = 0; + crpc->crp_stamp = 0; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +static int +lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +{ + lstcon_rpc_t *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, 
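+ /* Console RPC descriptors are recycled: lstcon_rpc_prep() first tries to
+ * reuse one from ses_rpc_freelist (under ses_rpc_lock) and only falls
+ * back to LIBCFS_ALLOC() when the list is empty.  lstcon_rpc_put() later
+ * returns ordinary descriptors to that freelist, while embedded ones
+ * (the per-node ping RPC) are simply zeroed in place; ses_rpc_counter
+ * tracks how many are alive so shutdown can wait for them. */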
+ lstcon_rpc_t, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(lstcon_rpc_t *crpc) +{ + srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT(list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].kiov_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].kiov_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +static void +lstcon_rpc_post(lstcon_rpc_t *crpc) +{ + lstcon_rpc_trans_t *trans = crpc->crp_trans; + + LASSERT(trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * +lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_node_t *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ 
+ crpc->crp_stamp != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp == 0) { + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + continue; + + nd->nd_stamp = crpc->crp_stamp; + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 1 : 0; +} + +int +lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +{ + lstcon_rpc_t *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +{ + lstcon_node_t *nd = crpc->crp_node; + srpc_client_rpc_t *rpc = crpc->crp_rpc; + srpc_generic_reply_t *rep; + + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + return 0; + + nd->nd_stamp = crpc->crp_stamp; + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat) +{ + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + 
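+ /* lstcon_rpc_trans_postwait() above posts every queued RPC, drops
+ * ses_mutex while it sleeps in wait_event_interruptible_timeout(), and
+ * re-takes it before returning; a session shutdown is reported as
+ * -ESHUTDOWN, an interrupted wait (or a timeout shorter than twice
+ * LST_TRANS_MIN_TIMEOUT) as -EINTR, and any RPCs still outstanding are
+ * handed to lstcon_rpc_trans_abort().  Note that the abort path compares
+ * against the positive ETIMEDOUT while its callers here pass negative
+ * errnos, so the LST_NODE_DOWN marking there does not appear to be
+ * reachable from these call sites. */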
lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); + + return; +} + +int +lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head *next; + lstcon_rpc_ent_t *ent; + srpc_generic_reply_t *rep; + lstcon_rpc_t *crpc; + srpc_msg_t *msg; + lstcon_node_t *nd; + long dur; + struct timeval tv; + int error; + + LASSERT(head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, lstcon_rpc_ent_t, rpe_link); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = (long)cfs_time_sub(crpc->crp_stamp, + (unsigned long)console_session.ses_id.ses_stamp); + cfs_duration_usec(dur, &tv); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(lnet_process_id_t)) || + copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(lst_sid_t)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_rpc_t *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, + crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is called) + * because huge timeout for inaccessible network, don't make + * user wait for them, just abandon them, they will be recycled + * in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); + + return; +} + +int +lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, + unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_mksn_reqst_t *msrq; + srpc_rmsn_reqst_t *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = 
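+ /* The strncpy() below is bounded by strlen(ses_name), so it never copies
+ * a terminating NUL into mksn_name; the name only arrives terminated
+ * because the freshly built request message appears to be zero-filled
+ * before it is packed. */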
&(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strncpy(msrq->mksn_name, console_session.ses_name, + strlen(console_session.ses_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_debug_reqst_t *drq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +{ + lstcon_batch_t *batch; + srpc_batch_reqst_t *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP : + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT(tsb->tsb_index == 0); + + batch = (lstcon_batch_t *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_stat_reqst_t *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static lnet_process_id_packed_t * +lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT(i < nkiov); + + pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, + int dist, int span, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int start; + int end; + int i = 0; + + LASSERT(dist >= 1); + LASSERT(span >= 1); + LASSERT(grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
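+ /* Worked example of the distribution arithmetic above, with
+ * grp_nnode = 6, dist = 2, span = 3: clients 0 and 1 get start = 0,
+ * end = 2 (destinations 0..2), clients 2 and 3 get 3..5, and clients 4
+ * and 5 wrap back to 0..2.  When the span itself wraps past the end of
+ * the node list (start > end after the modulo), this first loop stops at
+ * the tail of the list and the second loop below fills in the wrapped
+ * head. */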
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req) +{ + test_ping_req_t *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = 0; /* reserved */ + + return 0; +} + +int +lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_test_t *test, lstcon_rpc_t **crpc) +{ + lstcon_group_t *sgrp = test->tes_src_grp; + lstcon_group_t *dgrp = test->tes_dst_grp; + srpc_test_reqst_t *trq; + srpc_bulk_t *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_CACHE_SIZE : + sizeof(lnet_process_id_packed_t) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE); + nob -= len; + + bulk->bk_iovs[i].kiov_offset = 0; + bulk->bk_iovs[i].kiov_len = len; + bulk->bk_iovs[i].kiov_page = + alloc_page(GFP_IOFS); + + if (bulk->bk_iovs[i].kiov_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT(transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
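+ /* Summary of the two branches above: a server-side add (TSBSRVADD)
+ * sends no destination list at all -- tsr_ndest is 0 and tsr_loop is
+ * derived from the source/target group sizes and the test's dist, span
+ * and concur parameters -- while a client-side add (TSBCLIADD) ships the
+ * packed destination IDs produced by lstcon_dstnodes_prep() in the bulk
+ * pages, sized per page for old peers and to exactly tes_span packed IDs
+ * when LST_FEAT_BULK_LEN is negotiated. */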
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((lst_test_ping_param_t *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, + lstcon_node_t *nd, srpc_msg_t *reply) +{ + srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, + lstcon_node_t *nd, lstcon_trans_stat_t *stat) +{ + srpc_rmsn_reply_t *rmsn_rep; + srpc_debug_reply_t *dbg_rep; + srpc_batch_reply_t *bat_rep; + srpc_test_reply_t *test_rep; + srpc_stat_reply_t *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case 
LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; + + return; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + lstcon_rpc_t *rpc; + unsigned feats; + int rc; + + /* Creating session RPG for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC for transaction %d: %d\n", + transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (lstcon_test_t *)arg, &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (lstcon_tsb_hdr_t *)arg, &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static void +lstcon_rpc_pinger(void *arg) +{ + stt_timer_t *ptimer = (stt_timer_t *)arg; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + srpc_debug_reqst_t *drq; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + time_t intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by timer at 8 seconds interval. 
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + get_seconds() - console_session.ses_laststamp > + (time_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(), + nd->nd_stamp)); + if (intv < (time_t)nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL); + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + stt_timer_t *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL); + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT(console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + struct list_head *pacer; + struct list_head zlist; + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, lstcon_rpc_trans_t, + tas_link); + + 
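+ /* Shutdown drains in two steps: each pass of this loop wakes every
+ * transaction still queued on ses_trans_list (their waiters see
+ * ses_shutdown via lstcon_rpc_trans_check() and bail out), then the
+ * mutex is dropped for a second so the owning threads can finish and
+ * destroy them; once the list is empty, the code below waits for
+ * ses_rpc_counter to reach zero and frees whatever is left on the RPC
+ * freelist. */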
CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, waiting for termination of transactions\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessible or target is down, waiting for %d console RPCs to being recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_add(&zlist, &console_session.ses_rpc_freelist); + list_del_init(&console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h new file mode 100644 index 000000000..2353889c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h @@ -0,0 +1,146 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) min(max(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +typedef struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + unsigned long crp_stamp; /* replied time stamp */ +} lstcon_rpc_t; + +typedef struct lstcon_rpc_trans { + struct list_head tas_olink; /* link chain on owner list */ + struct list_head tas_link; /* link chain on global list */ + int tas_opc; /* operation code of transaction */ + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +} lstcon_rpc_trans_t; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, lstcon_rpc_t **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + lstcon_rpc_t **crpc); +void lstcon_rpc_put(lstcon_rpc_t *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp); +void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + lstcon_trans_stat_t *stat); +int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, 
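+/* The condition callback handed to lstcon_rpc_trans_ndlist() is consulted
+ * once per node: a positive return queues an RPC for that node, zero skips
+ * it, and a negative value aborts building the transaction.  A minimal
+ * sketch of such a filter (hypothetical, not a callback defined by this
+ * patch):
+ *
+ *	static int
+ *	only_active_nodes(int transop, struct lstcon_node *nd, void *arg)
+ *	{
+ *		return nd->nd_state == LST_NODE_ACTIVE ? 1 : 0;
+ *	}
+ */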
+ lstcon_rpc_readent_func_t readent); +void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); +void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); +void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); +int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.c b/kernel/drivers/staging/lustre/lnet/selftest/console.c new file mode 100644 index 000000000..2b5f53c7a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/console.c @@ -0,0 +1,2096 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown++; \ + else \ + (p)->nle_nunknown++; \ + (p)->nle_nnode++; \ +} while (0) + +lstcon_session_t console_session; + +static void +lstcon_node_get(lstcon_node_t *nd) +{ + LASSERT(nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create) +{ + lstcon_ndlink_t *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT(id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (lstcon_ndlink_t *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = cfs_time_current(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(lstcon_node_t *nd) +{ + lstcon_ndlink_t *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (lstcon_ndlink_t *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, + lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 
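+ /* Node bookkeeping above: lstcon_node_find() allocates the lstcon_node_t
+ * and its session-global lstcon_ndlink_t in a single LIBCFS_ALLOC() (the
+ * link sits directly behind the node, hence the "*ndpp + 1" / "nd + 1"
+ * arithmetic), and the global hash and list take no reference of their
+ * own, so lstcon_node_put() unhooks and frees both once nd_ref drops to
+ * zero.  Per-group links, by contrast, are allocated separately right
+ * here and each one holds its own node reference. */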
1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(lstcon_ndlink_t *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) + strcpy(grp->grp_name, name); + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(lstcon_group_t *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); + +static void +lstcon_group_drain(lstcon_group_t *grp, int keep) +{ + lstcon_ndlink_t *ndl; + lstcon_ndlink_t *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(lstcon_group_t *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + } + + LIBCFS_FREE(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static void +lstcon_group_put(lstcon_group_t *grp) +{ + lstcon_group_decref(grp); +} + +static int +lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id, + lstcon_ndlink_t **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode --; +} + +static void +lstcon_group_ndlink_move(lstcon_group_t *old, + lstcon_group_t *new, lstcon_ndlink_t *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + list_del(&ndl->ndl_hlink); + list_del(&ndl->ndl_link); + old->grp_nnode --; + + list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; + + return; +} + +static void +lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +{ + lstcon_ndlink_t *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + lstcon_ndlink_t, 
ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_group_t *grp = (lstcon_group_t *)arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_debug_reply_t *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(tmp); + return rc; + } + + /* post all RPCs */ + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPGs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + 
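+ /* lstcon_group_nodes_add() and this function show the canonical console
+ * transaction sequence; a minimal sketch of the same pattern for a new
+ * operation (error handling elided, SESQRY picked only as an example of
+ * a non-private opcode) would look like:
+ *
+ *	lstcon_rpc_trans_t *trans;
+ *
+ *	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ *				     &grp->grp_trans_list, LST_TRANS_SESQRY,
+ *				     grp, lstcon_sesrpc_condition, &trans);
+ *	if (rc == 0) {
+ *		lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+ *		rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+ *		lstcon_rpc_trans_destroy(trans);
+ *	}
+ */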
lstcon_group_put(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + lstcon_group_t *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* find a group with same name */ + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + LASSERT(count > 0); + LASSERT(ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by others threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_put(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group anyway */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + lnet_process_id_t *ids_up, struct list_head *result_up) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); 
+ return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes into the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char *name_up) +{ + lstcon_group_t *grp; + + LASSERT(index >= 0); + LASSERT(name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int count = 0; + int index = 0; + + LASSERT(index_p != NULL && count_p != NULL); + LASSERT(dents_up != NULL); + LASSERT(*index_p >= 0); + LASSERT(*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p, + int *index_p, int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlist_ent_t *gentp; + lstcon_group_t *grp; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_put(grp); + + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t)); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_put(grp); + + return -ENOMEM; + } + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0; + + LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t)); + + lstcon_group_put(grp); + + return 0; +} + +static int +lstcon_batch_find(const char *name, lstcon_batch_t **batpp) +{ + lstcon_batch_t *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + lstcon_batch_t *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + strcpy(bat->bat_name, name); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char *name_up) +{ + lstcon_batch_t *bat; + + LASSERT(name_up != NULL); + LASSERT(index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up, bat->bat_name, len) ? + -EFAULT: 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server, + int testidx, int *index_p, int *ndent_p, + lstcon_node_ent_t *dents_up) +{ + lstcon_test_batch_ent_t *entp; + struct list_head *clilst; + struct list_head *srvlst; + lstcon_test_t *test = NULL; + lstcon_batch_t *bat; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index starts from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t)); + if (entp == NULL) + return -ENOMEM; + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(lstcon_test_batch_ent_t)) ? 
-EFAULT : 0; + + LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t)); + + return rc; +} + +static int +lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(lstcon_batch_t *bat, int transop, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started in any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(lstcon_batch_t *bat) +{ + lstcon_ndlink_t *ndl; + lstcon_test_t *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + lstcon_test_t, tes_link); + LASSERT(list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_put(test->tes_src_grp); + lstcon_group_put(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(lstcon_test_t, + tes_param[test->tes_paramlen])); + } + + LASSERT(list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { + ndl = list_entry(bat->bat_srv_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); +} + +static int +lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_test_t *test; + lstcon_batch_t *batch; + lstcon_ndlink_t *ndl; + struct list_head *hash; + struct list_head *head; + + test = (lstcon_test_t *)arg; + LASSERT(test != NULL); + + batch = test->tes_batch; + 
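/* Decide whether this node takes part in the test: skip the server-side add for one-sided tests, require the node to be active, then link it into the batch's client or server list (creating the ndlink if necessary). */ +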
LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT(transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT(nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int transop; + int rc; + + LASSERT(test->tes_src_grp != NULL); + LASSERT(test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? "client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, lstcon_batch_t **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, lstcon_group_t **grp) +{ + int rc; + lstcon_ndlink_t *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) + return 0; + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up) +{ + lstcon_test_t *test = NULL; + int rc; + lstcon_group_t *src_grp = NULL; + lstcon_group_t *dst_grp = NULL; + lstcon_batch_t *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test 
descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_put(dst_grp); + + if (src_grp != NULL) + lstcon_group_put(src_grp); + + return rc; +} + +static int +lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +{ + lstcon_test_t *test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + LASSERT(transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + struct list_head *translist; + struct list_head *ndlist; + lstcon_tsb_hdr_t *hdr; + lstcon_batch_t *batch; + lstcon_test_t *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? 
LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + sfw_counters_t *sfwk_stat; + srpc_counters_t *srpc_stat; + lnet_counters_t *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0]; + srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head *result_up) +{ + struct list_head head; + lstcon_rpc_trans_t *trans; + int rc; + + INIT_LIST_HEAD(&head); + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up) +{ + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? 
D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lnet_process_id_t id; + lstcon_ndlink_t *ndl; + lstcon_group_t *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_session_match(lst_sid_t sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 
1: 0; +} + +static void +lstcon_new_session_id(lst_sid_t *sid) +{ + lnet_process_id_t id; + + LASSERT(console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = cfs_time_current(); +} + +extern srpc_service_t lstcon_acceptor_service; + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, lst_sid_t *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? + LST_CONSOLE_TIMEOUT : timeout; + strcpy(console_session.ses_name, name); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + lstcon_batch_t *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp, + lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len) +{ + lstcon_ndlist_ent_t *entp; + lstcon_ndlink_t *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + lstcon_batch_t *bat; + int rc = 0; + + LASSERT(console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while 
(!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + lstcon_batch_t, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + lstcon_group_t, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_put(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match the console's session features %x\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle(srpc_server_rpc_t *rpc) +{ + srpc_msg_t *rep = &rpc->srpc_replymsg; + srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; + srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; + srpc_join_reply_t *jrep = &rep->msg_body.join_reply; + lstcon_group_t *grp = NULL; + lstcon_ndlink_t *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* group is in use */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strcpy(jrep->join_session, console_session.ses_name); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_put(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +srpc_service_t lstcon_acceptor_service; +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +extern int lstcon_ioctl_entry(unsigned int 
cmd, struct libcfs_ioctl_data *data); + +static DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + memset(&console_session, 0, sizeof(lstcon_session_t)); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = get_seconds(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + LIBCFS_ALLOC(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = libcfs_register_ioctl(&lstcon_ioctl_handler); + + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + libcfs_deregister_ioctl(&lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + } + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.h b/kernel/drivers/staging/lustre/lnet/selftest/console.h new file mode 100644 index 000000000..e41ca89f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/console.h @@ -0,0 +1,235 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" +#include "selftest.h" +#include "conrpc.h" + +typedef struct lstcon_node { + lnet_process_id_t nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + unsigned long nd_stamp; /* timestamp of last replied RPC */ + struct lstcon_rpc nd_ping; /* ping rpc */ +} lstcon_node_t; /*** node descriptor */ + +typedef struct { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + lstcon_node_t *ndl_node; /* pointer to node */ +} lstcon_ndlink_t; /*** node link descriptor */ + +typedef struct { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +} lstcon_group_t; /*** (alias of nodes) group descriptor */ + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +typedef struct lstcon_tsb_hdr { + lst_bid_t tsb_id; /* batch ID */ + int tsb_index; /* test index */ +} lstcon_tsb_hdr_t; + +typedef struct { + lstcon_tsb_hdr_t bat_hdr; /* test_batch header */ + struct list_head bat_link; /* chain on session's batches list */ + int bat_ntest; /* # of test */ + int bat_state; /* state of the batch */ + int bat_arg; /* parameter for run|stop, timeout for run, force for stop */ + char bat_name[LST_NAME_SIZE]; /* name of batch */ + + struct list_head bat_test_list; /* list head of tests (lstcon_test_t) */ + struct list_head bat_trans_list; /* list head of transaction */ + struct list_head bat_cli_list; /* list head of client nodes (lstcon_node_t) */ + struct list_head *bat_cli_hash; /* hash table of client nodes */ + struct list_head bat_srv_list; /* list head of server nodes */ + struct list_head *bat_srv_hash; /* hash table of server nodes */ +} lstcon_batch_t; /*** (tests ) batch descriptor */ + +typedef struct lstcon_test { + 
lstcon_tsb_hdr_t tes_hdr; /* test batch header */ + struct list_head tes_link; /* chain on batch's tests list */ + lstcon_batch_t *tes_batch; /* pointer to batch */ + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + lstcon_group_t *tes_src_grp; /* group run the test */ + lstcon_group_t *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +} lstcon_test_t; /*** a single test descriptor */ + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +typedef struct { + struct mutex ses_mutex; /* only 1 thread in session */ + lst_sid_t ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + lstcon_rpc_trans_t *ses_ping; /* session pinger */ + stt_timer_t ses_ping_timer; /* timer for pinger */ + lstcon_trans_stat_t ses_trans_stat; /* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist; /* idle console rpc */ +} lstcon_session_t; /*** session descriptor */ + +extern lstcon_session_t console_session; + +static inline lstcon_trans_stat_t * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash (lnet_process_id_t id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +int lstcon_console_init(void); +int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data); +int lstcon_console_fini(void); +extern int lstcon_session_match(lst_sid_t sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, lst_sid_t *sid_up); +extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp, + lstcon_ndlist_ent_t *entp, char *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, struct list_head *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int 
timeout, char *name, + int client, struct list_head *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head *result_up); +extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up, + unsigned *featp, struct list_head *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up, + int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up); +extern int lstcon_group_list(int idx, int len, char *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head *result_up); +extern int lstcon_batch_stop(char *name, int force, + struct list_head *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char *name_up); +extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, lstcon_node_ent_t *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head *result_up); +extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up); +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/framework.c b/kernel/drivers/staging/lustre/lnet/selftest/framework.c new file mode 100644 index 000000000..a93a90de0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/framework.c @@ -0,0 +1,1804 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).errors); \ + __swab32s(&(lc).msgs_max); \ + __swab32s(&(lc).msgs_alloc); \ + __swab32s(&(lc).send_count); \ + __swab32s(&(lc).recv_count); \ + __swab32s(&(lc).drop_count); \ + __swab32s(&(lc).route_count); \ + __swab64s(&(lc).send_length); \ + __swab64s(&(lc).recv_length); \ + __swab64s(&(lc).drop_length); \ + __swab64s(&(lc).route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + struct list_head fw_zombie_rpcs; /* RPCs to be recycled */ + struct list_head fw_zombie_sessions; /* stopping sessions */ + struct list_head fw_tests; /* registered test cases */ + atomic_t fw_nzombies; /* # zombie sessions */ + spinlock_t fw_lock; /* serialise */ + sfw_session_t *fw_session; /* _the_ session */ + int fw_shuttingdown; /* shutdown in progress */ + srpc_server_rpc_t *fw_active_srpc; /* running RPC */ +} sfw_data; + +/* forward ref's */ +int sfw_stop_batch(sfw_batch_t *tsb, int force); +void sfw_destroy_session(sfw_session_t *sn); + +static inline sfw_test_case_t * +sfw_find_test_case(int id) +{ + sfw_test_case_t *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test(srpc_service_t *service, sfw_test_client_ops_t *cliops) +{ + sfw_test_case_t *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer(void) +{ + sfw_session_t *sn = sfw_data.fw_session; + stt_timer_t *timer = &sn->sn_timer; + + LASSERT(!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT(!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = 
cfs_time_add(sn->sn_timeout, + get_seconds()); + stt_add_timer(timer); + return; +} + +static int +sfw_del_session_timer(void) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT(sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +static void +sfw_deactivate_session(void) + __must_hold(&sfw_data.fw_lock) +{ + sfw_session_t *sn = sfw_data.fw_session; + int nactive = 0; + sfw_batch_t *tsb; + sfw_test_case_t *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired(void *data) +{ + sfw_session_t *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT(sn->sn_timer_active); + LASSERT(sn == sfw_data.fw_session); + + CWARN("Session expired! sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(sfw_session_t *sn, lst_sid_t sid, + unsigned features, const char *name) +{ + stt_timer_t *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(sfw_session_t)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = cfs_time_current(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG(D_NET, + "Incoming framework RPC done: service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); + return; +} + +static void +sfw_client_rpc_fini(srpc_client_rpc_t *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, + "Outgoing framework RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + 
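+/* Batch lookup helpers: sfw_find_batch() searches the current session's batch list for a matching batch id, and sfw_bid2batch() returns the existing batch or allocates, initializes and registers a new one with the session. */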
+static sfw_batch_t * +sfw_find_batch(lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static sfw_batch_t * +sfw_bid2batch(lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT(sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats(srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_counters_t *cnt = &reply->str_fw; + sfw_batch_t *bat; + struct timeval tv; + + reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cfs_duration_usec(cfs_time_sub(cfs_time_current(), + sn->sn_started), &tv); + + cnt->running_ms = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_msg_t *msg = container_of(request, srpc_msg_t, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. 
*/ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + if (sn == NULL) { + CERROR("Dropping RPC (mksn) under memory pressure.\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session(srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session(srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini(srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(sfw_test_instance_t *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + struct srpc_service *svc = tsc->tsc_srv_service; + int nbuf; + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", + svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... */ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); + return; +} + +static void +sfw_destroy_test_instance(sfw_test_instance_t *tsi) +{ + srpc_client_rpc_t *rpc; + sfw_test_unit_t *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + sfw_test_unit_t, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); + return; +} + +static void +sfw_destroy_batch(sfw_batch_t *tsb) +{ + sfw_test_instance_t *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + sfw_test_instance_t, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + return; +} + +void +sfw_destroy_session(sfw_session_t *sn) +{ + sfw_batch_t *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + sfw_batch_t, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); + return; +} + +static void +sfw_unpack_addtest_req(srpc_msg_t *msg) +{ + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + LASSERT(msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT(req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + test_ping_req_t *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); + return; +} + +static int +sfw_add_test_instance(sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +{ + srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + srpc_bulk_t *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR("Can't allocate test instance for batch: %llu\n", + tsb->bat_id.bat_id); + return -ENOMEM; 
+ } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT(!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT(bk != NULL); + LASSERT(bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(lnet_process_id_packed_t) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + lnet_process_id_packed_t *dests; + lnet_process_id_packed_t id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); + LASSERT(dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done(sfw_test_unit_t *tsu) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_batch_t *tsb = tsi->tsi_batch; + sfw_session_t *sn = tsb->bat_session; + + LASSERT(sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + return; +} + +static void +sfw_test_rpc_done(srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); + return; +} + +int +sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer, + unsigned features, int nblk, int blklen, + srpc_client_rpc_t **rpcpp) +{ + srpc_client_rpc_t *rpc = NULL; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test(swi_workitem_t *wi) +{ + sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; + sfw_test_instance_t *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; + + LASSERT(wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT(rpc == NULL); + goto test_done; + } + + LASSERT(rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + rpc->crpc_timeout = rpc_timeout; + + spin_lock(&rpc->crpc_lock); + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch(sfw_batch_t *tsb) +{ + swi_workitem_t *wi; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, tsu, sfw_run_test, + lst_sched_test[\ + lnet_cpt_of_nid(tsu->tsu_dest.nid)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +int +sfw_stop_batch(sfw_batch_t *tsb, int force) +{ + sfw_test_instance_t *tsi; + srpc_client_rpc_t *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch(sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +{ + sfw_test_instance_t *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages(srpc_server_rpc_t *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test(srpc_server_rpc_t *rpc) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + srpc_test_reqst_t *request; + int rc; + sfw_batch_t *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("Dropping RPC (%s) from %s under memory pressure.\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_CACHE_SIZE; + + } else { + len = sizeof(lnet_process_id_packed_t) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG(rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch(srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + int rc = 0; + sfw_batch_t *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reply = &rpc->srpc_replymsg; + srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer.", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + sfw_session_t *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch (sv->sv_id) { + default: + LBUG(); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + if 
(sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT(!sfw_data.fw_shuttingdown); + LASSERT(service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + 
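[Editor's sketch, not part of the patch: sfw_create_rpc() above first tries to reuse an RPC descriptor parked on the fw_zombie_rpcs free list before falling back to a fresh allocation. The standalone userspace C below illustrates that recycle-before-allocate pattern with invented names and a pthread mutex standing in for fw_lock.]

	/* Recycle-before-allocate sketch: take a descriptor from a locked
	 * free list if one is available, otherwise malloc() a new one.
	 * All names are illustrative only. */
	#include <pthread.h>
	#include <stdlib.h>

	struct toy_rpc {
		struct toy_rpc *next;	/* free-list linkage */
		int service;
	};

	static struct toy_rpc *free_list;
	static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct toy_rpc *toy_rpc_get(int service)
	{
		struct toy_rpc *rpc = NULL;

		pthread_mutex_lock(&free_lock);
		if (free_list != NULL) {	/* reuse a parked descriptor */
			rpc = free_list;
			free_list = rpc->next;
		}
		pthread_mutex_unlock(&free_lock);

		if (rpc == NULL)		/* nothing cached: allocate */
			rpc = malloc(sizeof(*rpc));
		if (rpc != NULL)
			rpc->service = service;
		return rpc;
	}

	static void toy_rpc_put(struct toy_rpc *rpc)
	{
		pthread_mutex_lock(&free_lock);
		rpc->next = free_list;		/* park it for the next caller */
		free_list = rpc;
		pthread_mutex_unlock(&free_lock);
	}

	int main(void)
	{
		struct toy_rpc *rpc = toy_rpc_get(1);

		if (rpc != NULL)
			toy_rpc_put(rpc);	/* the next get() reuses this one */
		rpc = toy_rpc_get(2);
		if (rpc != NULL)
			toy_rpc_put(rpc);
		return 0;
	}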
__swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + srpc_join_reply_t *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG(); + return; +} + +void +sfw_abort_rpc(srpc_client_rpc_t *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); + return; +} + +void +sfw_post_rpc(srpc_client_rpc_t *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); + return; +} + +static srpc_service_t sfw_services[] = { + { + /* sv_id */ SRPC_SERVICE_DEBUG, + /* sv_name */ "debug", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_QUERY_STAT, + /* sv_name */ "query stats", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_MAKE_SESSION, + /* sv_name */ "make session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, + /* sv_name */ "remove session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_BATCH, + /* sv_name */ "batch service", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_TEST, + /* sv_name */ "test service", + 0 + }, + { + /* sv_id */ 0, + /* sv_name */ NULL, + 0 + } +}; + +extern sfw_test_client_ops_t ping_test_client; +extern srpc_service_t ping_test_service; +extern void ping_init_test_client(void); +extern void ping_init_test_service(void); + +extern sfw_test_client_ops_t brw_test_client; +extern srpc_service_t brw_test_service; +extern void brw_init_test_client(void); +extern void brw_init_test_service(void); + + +int +sfw_startup(void) +{ + int i; + int rc; + int error; + srpc_service_t *sv; + sfw_test_case_t *tsc; + + + if (session_timeout < 0) { + CERROR("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN("Zero session_timeout specified - test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN("Zero rpc_timeout specified - test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + 
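[Editor's sketch, not part of the patch: sfw_unpack_message() above only byte-swaps a message when its magic arrives as __swab32(SRPC_MSG_MAGIC), i.e. when the sender ran on an opposite-endian host. The self-contained userspace C below shows that magic-based fixup with a made-up message layout and constant.]

	/* Byte-order fixup keyed on a magic field: if the magic matches the
	 * swapped constant, every multi-byte field must be swabbed.
	 * Struct layout and constant are invented for illustration. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TOY_MSG_MAGIC 0xbabefaceU

	static uint32_t swab32(uint32_t v)
	{
		return ((v & 0x000000ffU) << 24) | ((v & 0x0000ff00U) << 8) |
		       ((v & 0x00ff0000U) >> 8)  | ((v & 0xff000000U) >> 24);
	}

	struct toy_msg {
		uint32_t magic;
		uint32_t seq;
		uint32_t status;
	};

	static void toy_unpack(struct toy_msg *msg)
	{
		if (msg->magic == TOY_MSG_MAGIC)
			return;			/* same byte order, nothing to do */

		assert(msg->magic == swab32(TOY_MSG_MAGIC));
		msg->magic  = swab32(msg->magic);
		msg->seq    = swab32(msg->seq);
		msg->status = swab32(msg->status);
	}

	int main(void)
	{
		struct toy_msg msg = {
			.magic = swab32(TOY_MSG_MAGIC),	/* as if from an opposite-endian peer */
			.seq   = swab32(7),
			.status = 0,
		};

		toy_unpack(&msg);
		printf("seq %u\n", msg.seq);		/* prints 7 */
		return 0;
	}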
atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT(rc == 0); + + ping_init_test_client(); + ping_init_test_service(); + rc = sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT(rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown(void) +{ + srpc_service_t *sv; + sfw_test_case_t *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + srpc_client_rpc_t *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + sfw_test_case_t, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } + + return; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/module.c b/kernel/drivers/staging/lustre/lnet/selftest/module.c new file mode 100644 index 000000000..7ad62f167 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/module.c @@ -0,0 +1,159 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +extern int lstcon_console_init(void); +extern int lstcon_console_fini(void); + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_fini(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + case LST_INIT_FW: + sfw_shutdown(); + case LST_INIT_RPC: + srpc_shutdown(); + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + case LST_INIT_NONE: + break; + default: + LBUG(); + } +} + +static int +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds); + if (lst_sched_test == NULL) + goto error; + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPT affinity WI scheduler %d for LST\n", + i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + 
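[Editor's sketch, not part of the patch: lnet_selftest_init() records the last stage that succeeded in lst_init_step, and lnet_selftest_fini() unwinds with a fall-through switch so only completed stages are torn down. The standalone C below reproduces that staged-init/LIFO-teardown pattern with invented stage names.]

	/* Staged init with fall-through teardown: remember how far setup got
	 * and unwind from exactly there on failure or module exit. */
	#include <stdio.h>

	enum { INIT_NONE = 0, INIT_SCHED, INIT_RPC, INIT_FRAMEWORK };

	static int init_step = INIT_NONE;

	static int  sched_setup(void)    { puts("sched up");   return 0; }
	static void sched_teardown(void) { puts("sched down"); }
	static int  rpc_setup(void)      { puts("rpc up");     return 0; }
	static void rpc_teardown(void)   { puts("rpc down");   }
	static int  fw_setup(void)       { puts("fw up");      return -1; } /* pretend this fails */
	static void fw_teardown(void)    { puts("fw down");    }

	static void toy_fini(void)
	{
		switch (init_step) {
		case INIT_FRAMEWORK:
			fw_teardown();
			/* fall through */
		case INIT_RPC:
			rpc_teardown();
			/* fall through */
		case INIT_SCHED:
			sched_teardown();
			/* fall through */
		case INIT_NONE:
			break;
		}
	}

	static int toy_init(void)
	{
		if (sched_setup() != 0)
			goto error;
		init_step = INIT_SCHED;

		if (rpc_setup() != 0)
			goto error;
		init_step = INIT_RPC;

		if (fw_setup() != 0)
			goto error;
		init_step = INIT_FRAMEWORK;
		return 0;

	error:
		toy_fini();	/* unwind only the stages that completed */
		return -1;
	}

	int main(void)
	{
		return toy_init() == 0 ? 0 : 1;
	}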
return 0; +error: + lnet_selftest_fini(); + return rc; +} + + +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.9.0"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_fini); diff --git a/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c new file mode 100644 index 000000000..644069a9f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +typedef struct { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +} lst_ping_data_t; + +static lst_ping_data_t lst_ping_data; + +static int +ping_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN("%d pings have failed.\n", errors); + else + CDEBUG(D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpc) +{ + srpc_ping_reqst_t *req; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct timeval tv; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + cfs_fs_timeval(&tv); + req->pnr_time_sec = tv.tv_sec; + req->pnr_time_usec = tv.tv_usec; + + return rc; +} + +static void +ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timeval tv; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + cfs_fs_timeval(&tv); + CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq, + (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000 + + (tv.tv_usec - reqst->pnr_time_usec))); + return; +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reqstmsg = 
&rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; + srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT(sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_usec); + } + LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR("Unexpected magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +srpc_service_t ping_test_service; +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.c b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c new file mode 100644 index 000000000..080788ab7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c @@ -0,0 +1,1673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +typedef enum { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +} srpc_state_t; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + lnet_handle_eq_t rpc_lnet_eq; /* _the_ LNet event queue */ + srpc_state_t rpc_state; + srpc_counters_t rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? + SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +int srpc_handle_rpc(swi_workitem_t *wi); + +void srpc_get_counters(srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob) +{ + nob = min(nob, (int)PAGE_CACHE_SIZE); + + LASSERT(nob > 0); + LASSERT(i >= 0 && i < bk->bk_niov); + + bk->bk_iovs[i].kiov_offset = 0; + bk->bk_iovs[i].kiov_page = pg; + bk->bk_iovs[i].kiov_len = nob; + return nob; +} + +void +srpc_free_bulk(srpc_bulk_t *bk) +{ + int i; + struct page *pg; + + LASSERT(bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) + break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); + return; +} + +srpc_bulk_t * +srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink) +{ + srpc_bulk_t *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt), + GFP_IOFS, 0); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = srpc_add_bulk_page(bk, pg, i, bulk_len); + bulk_len -= nob; + } + + return bk; +} + +static inline __u64 +srpc_next_id(void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, scd, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + + failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service(srpc_service_t *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, lnet_process_id_t peer, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + lnet_handle_me_t meh; + + rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT(rc == 0); + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, lnet_process_id_t peer, lnet_nid_t self, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +static int +srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf, + int len, lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + return srpc_post_active_rdma(srpc_serv_portal(service), service, + buf, len, LNET_MD_OP_PUT, peer, + LNET_NID_ANY, mdh, ev); +} + +static int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + lnet_process_id_t any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +static int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) + __must_hold(&scd->scd_lock) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. 
So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = wi->swi_workitem.wi_data; + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = max(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = get_seconds(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. 
+ */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { + spin_unlock(&scd->scd_lock); + return 0; + } + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to unlink", + scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +static void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) + __must_hold(&scd->scd_lock) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recycle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in its headers */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(srpc_service_t *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + srpc_buffer_t *buf; + int i; + + 
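[Editor's sketch, not part of the patch: srpc_service_add_buffers() above kicks off buffer posting on the workitem schedulers and then blocks in lst_wait_until() until either the posting counters drain or an error is recorded. The userspace C below shows the same "start work asynchronously, wait for a count or an error" handshake using a pthread condition variable instead of lst_wait_until() over scd_lock; names and the target count are invented.]

	/* Post asynchronously, then wait for completion or failure. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
	static int nposted;
	static int err;

	static void *poster(void *arg)
	{
		int target = *(int *)arg;
		int i;

		for (i = 0; i < target; i++) {
			pthread_mutex_lock(&lock);
			nposted++;			/* one more buffer posted */
			pthread_cond_signal(&cond);
			pthread_mutex_unlock(&lock);
		}
		return NULL;
	}

	int main(void)
	{
		int target = 16;
		pthread_t tid;

		pthread_create(&tid, NULL, poster, &target);

		pthread_mutex_lock(&lock);
		while (err == 0 && nposted < target)	/* wait for all posts or an error */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);

		pthread_join(tid, NULL);
		printf("posted %d buffers, err %d\n", nposted, err);
		return err != 0;
	}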
CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request(srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service, + &rpc->crpc_reqstmsg, sizeof(srpc_msg_t), + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply(srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_replyev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, sizeof(srpc_msg_t), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk(srpc_client_rpc_t *rpc) +{ + srpc_bulk_t *bk = &rpc->crpc_bulk; + srpc_event_t *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk->bk_niov <= LNET_MAX_IOV); + + if (bk->bk_niov == 0) + return 0; /* nothing to do */ + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk(srpc_server_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + srpc_bulk_t *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +static void +srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_buffer_t *buffer; + + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT(status == 0 ? 
D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + srpc_buffer_t, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); + return; +} + +/* handles an incoming RPC */ +int +srpc_handle_rpc(swi_workitem_t *wi) +{ + struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_event_t *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: { + srpc_msg_t *msg; + srpc_generic_reply_t *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status == 0 || !rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + 
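	/*
	 * Reaching the line below means srpc_send_reply() failed, so the RPC
	 * is completed immediately instead of waiting for the SRPC_REPLY_SENT
	 * event.  For orientation, the server-side state machine this switch
	 * implements is roughly (a sketch inferred from the swi_state
	 * assignments in this function, not an authoritative diagram):
	 *
	 *	SWI_STATE_NEWBORN
	 *	    -> sv_handler(), then srpc_do_bulk() if there is bulk data
	 *	SWI_STATE_BULK_STARTED
	 *	    -> sv_bulk_ready() (if set), then srpc_send_reply()
	 *	SWI_STATE_REPLY_SUBMITTED
	 *	    -> reply event fired, srpc_server_rpc_done()
	 *	SWI_STATE_DONE
	 *
	 * srpc_handle_rpc() is re-entered (via swi_schedule_workitem() from
	 * srpc_lnet_ev_handler()) each time a bulk or reply event fires,
	 * which is why the later states can assert that ev_fired is set.
	 */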
srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired(void *data) +{ + srpc_client_rpc_t *rpc = data; + + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +inline void +srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) +{ + stt_timer_t *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + get_seconds()); + stt_add_timer(timer); + return; +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. */ +static void +srpc_del_client_rpc_timer(srpc_client_rpc_t *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done(srpc_client_rpc_t *rpc, int status) +{ + swi_workitem_t *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); + return; +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc(swi_workitem_t *wi) +{ + int rc = 0; + srpc_client_rpc_t *rpc; + srpc_msg_t *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = wi->swi_workitem.wi_data; + + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. */ + if (!rpc->crpc_reqstev.ev_fired) + break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* perhaps more events, fall thru */ + case SWI_STATE_REQUEST_SENT: { + srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) + break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
*/ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +srpc_client_rpc_t * +srpc_create_client_rpc(lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc; + + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc(srpc_client_rpc_t *rpc, int why) +{ + LASSERT(why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc(srpc_client_rpc_t *rpc) +{ + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(lnet_event_t *ev) +{ + struct srpc_service_cd *scd; + srpc_event_t *rpcev = ev->md.user_ptr; + srpc_client_rpc_t *crpc; + srpc_server_rpc_t *srpc; + srpc_buffer_t *buffer; + srpc_service_t *sv; + srpc_msg_t *msg; + srpc_msg_type_t type; + + LASSERT(!in_interrupt()); + + if 
(ev->status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.errors++; + spin_unlock(&srpc_data.rpc_glock); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer->buf_peer = ev->initiator; + buffer->buf_self = ev->target.nid; + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < get_seconds()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = max(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup(void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) get_seconds()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + LNetInvalidateHandle(&srpc_data.rpc_lnet_eq); + rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + +bail: + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown(void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG(); + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + srpc_service_t *sv = srpc_data.rpc_services[i]; + + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT(rc == 0); /* the EQ should have no user by now */ + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } + + return; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.h b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h new file mode 100644 index 000000000..fbeb75fe5 --- /dev/null +++ 
b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h @@ -0,0 +1,302 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include "../../include/linux/lnet/lnetst.h" + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +typedef enum { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +} srpc_msg_type_t; + + +/* CAVEAT EMPTOR: + * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
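 *
 * Concretely, with the definitions below:
 *
 *   srpc_brw_reqst_t   { __u64 brw_rpyid;  __u64 brw_bulkid;  ... }
 *   srpc_brw_reply_t   { __u32 brw_status; ... }
 *
 * so the generic srpc_generic_reqst_t / srpc_generic_reply_t members of
 * the srpc_msg body union can be used to read the reply/bulk matchbits
 * and the status without knowing the specific message type (this is how
 * srpc_prepare_reply() and srpc_prepare_bulk() in rpc.c fill in rpyid
 * and bulkid).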
+ */ +typedef struct { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} WIRE_ATTR srpc_generic_reqst_t; + +typedef struct { + __u32 status; + lst_sid_t sid; +} WIRE_ATTR srpc_generic_reply_t; + +/* FRAMEWORK RPCs */ +typedef struct { + __u64 mksn_rpyid; /* reply buffer matchbits */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ + +typedef struct { + __u32 mksn_status; /* session status */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ + +typedef struct { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ + +typedef struct { + __u32 rmsn_status; + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + +typedef struct { + __u64 join_rpyid; /* reply buffer matchbits */ + lst_sid_t join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} WIRE_ATTR srpc_join_reqst_t; + +typedef struct { + __u32 join_status; /* returned status */ + lst_sid_t join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_join_reply_t; + +typedef struct { + __u64 dbg_rpyid; /* reply buffer matchbits */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} WIRE_ATTR srpc_debug_reqst_t; + +typedef struct { + __u32 dbg_status; /* returned code */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_debug_reply_t; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +typedef struct { + __u64 bar_rpyid; /* reply buffer matchbits */ + lst_sid_t bar_sid; /* session id */ + lst_bid_t bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} WIRE_ATTR srpc_batch_reqst_t; + +typedef struct { + __u32 bar_status; /* status of request */ + lst_sid_t bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} WIRE_ATTR srpc_batch_reply_t; + +typedef struct { + __u64 str_rpyid; /* reply buffer matchbits */ + lst_sid_t str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} WIRE_ATTR srpc_stat_reqst_t; + +typedef struct { + __u32 str_status; + lst_sid_t str_sid; + sfw_counters_t str_fw; + srpc_counters_t str_rpc; + lnet_counters_t str_lnet; +} WIRE_ATTR srpc_stat_reply_t; + +typedef struct { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} WIRE_ATTR test_bulk_req_t; + +typedef struct { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** reserved: offset */ + __u32 blk_offset; +} WIRE_ATTR test_bulk_req_v1_t; + +typedef struct { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} WIRE_ATTR test_ping_req_t; + +typedef struct { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer 
matchbits */ + lst_sid_t tsr_sid; /* session id */ + lst_bid_t tsr_bid; /* batch id */ + __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + test_ping_req_t ping; + test_bulk_req_t bulk_v0; + test_bulk_req_v1_t bulk_v1; + } tsr_u; +} WIRE_ATTR srpc_test_reqst_t; + +typedef struct { + __u32 tsr_status; /* returned code */ + lst_sid_t tsr_sid; +} WIRE_ATTR srpc_test_reply_t; + +/* TEST RPCs */ +typedef struct { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_usec; +} WIRE_ATTR srpc_ping_reqst_t; + +typedef struct { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} WIRE_ATTR srpc_ping_reply_t; + +typedef struct { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ + +typedef struct { + __u32 brw_status; +} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +typedef struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: srpc_msg_type_t */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + srpc_generic_reqst_t reqst; + srpc_generic_reply_t reply; + + srpc_mksn_reqst_t mksn_reqst; + srpc_mksn_reply_t mksn_reply; + srpc_rmsn_reqst_t rmsn_reqst; + srpc_rmsn_reply_t rmsn_reply; + srpc_debug_reqst_t dbg_reqst; + srpc_debug_reply_t dbg_reply; + srpc_batch_reqst_t bat_reqst; + srpc_batch_reply_t bat_reply; + srpc_stat_reqst_t stat_reqst; + srpc_stat_reply_t stat_reply; + srpc_test_reqst_t tes_reqst; + srpc_test_reply_t tes_reply; + srpc_join_reqst_t join_reqst; + srpc_join_reply_t join_reply; + + srpc_ping_reqst_t ping_reqst; + srpc_ping_reply_t ping_reply; + srpc_brw_reqst_t brw_reqst; + srpc_brw_reply_t brw_reply; + } msg_body; +} WIRE_ATTR srpc_msg_t; + +static inline void +srpc_unpack_msg_hdr(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/kernel/drivers/staging/lustre/lnet/selftest/selftest.h b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h new file mode 100644 index 000000000..d48701834 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h @@ -0,0 +1,624 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. 
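 *
 * Concretely (see the ids defined just below): SRPC_SERVICE_DEBUG through
 * SRPC_SERVICE_JOIN (0..6) are framework services, while SRPC_SERVICE_BRW
 * (11) and SRPC_SERVICE_PING (12) are test services; the helper
 * srpc_serv_is_framework() further down in this header tests exactly
 * sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID.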
+ */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline srpc_msg_type_t +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline srpc_msg_type_t +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +typedef enum { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +} srpc_event_type_t; + +/* RPC event */ +typedef struct { + srpc_event_type_t ev_type; /* what's up */ + lnet_event_kind_t ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? 
*/ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +} srpc_event_t; + +typedef struct { + int bk_len; /* len of bulk data */ + lnet_handle_md_t bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + lnet_kiov_t bk_iovs[0]; +} srpc_bulk_t; /* bulk descriptor */ + +/* message buffer descriptor */ +typedef struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + srpc_msg_t buf_msg; + lnet_handle_md_t buf_mdh; + lnet_nid_t buf_self; + lnet_process_id_t buf_peer; +} srpc_buffer_t; + +struct swi_workitem; +typedef int (*swi_action_t) (struct swi_workitem *); + +typedef struct swi_workitem { + struct cfs_wi_sched *swi_sched; + cfs_workitem_t swi_workitem; + swi_action_t swi_action; + int swi_state; +} swi_workitem_t; + +/* server-side state of a RPC */ +typedef struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + swi_workitem_t srpc_wi; + srpc_event_t srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + lnet_process_id_t srpc_peer; + srpc_msg_t srpc_replymsg; + lnet_handle_md_t srpc_replymdh; + srpc_buffer_t *srpc_reqstbuf; + srpc_bulk_t *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +} srpc_server_rpc_t; + +/* client-side state of a RPC */ +typedef struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + int crpc_timeout; /* # seconds to wait for reply */ + stt_timer_t crpc_timer; + swi_workitem_t crpc_wi; + lnet_process_id_t crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + srpc_event_t crpc_bulkev; /* bulk event */ + srpc_event_t crpc_reqstev; /* request event */ + srpc_event_t crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + srpc_msg_t crpc_reqstmsg; + srpc_msg_t crpc_replymsg; + lnet_handle_md_t crpc_reqstmdh; + lnet_handle_md_t crpc_replymdh; + srpc_bulk_t crpc_bulk; +} srpc_client_rpc_t; + +#define srpc_client_rpc_size(rpc) \ +offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + srpc_event_t scd_ev; + /** free RPC descriptors */ + struct 
list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + swi_workitem_t scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + unsigned long scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +typedef struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler) (srpc_server_rpc_t *); + int (*sv_bulk_ready) (srpc_server_rpc_t *, int); +} srpc_service_t; + +typedef struct { + struct list_head sn_list; /* chain on fw_zombie_sessions */ + lst_sid_t sn_id; /* unique identifier */ + unsigned int sn_timeout; /* # seconds' inactivity to expire */ + int sn_timer_active; + unsigned int sn_features; + stt_timer_t sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + unsigned long sn_started; +} sfw_session_t; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +typedef struct { + struct list_head bat_list; /* chain on sn_batches */ + lst_bid_t bat_id; /* batch id */ + int bat_error; /* error code of batch */ + sfw_session_t *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +} sfw_batch_t; + +typedef struct { + int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + lnet_process_id_t dest, + srpc_client_rpc_t **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + srpc_client_rpc_t *rpc); /* done a test rpc */ +} sfw_test_client_ops_t; + +typedef struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + sfw_batch_t *tsi_batch; /* batch */ + sfw_test_client_ops_t *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* 
test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs; /* active rpcs */ + + union { + test_ping_req_t ping; /* ping parameter */ + test_bulk_req_t bulk_v0; /* bulk parameter */ + test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + } tsi_u; +} sfw_test_instance_t; + +/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +typedef struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + lnet_process_id_t tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + swi_workitem_t tsu_worker; /* workitem of the test unit */ +} sfw_test_unit_t; + +typedef struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + srpc_service_t *tsc_srv_service; /* test service */ + sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ +} sfw_test_case_t; + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done) (srpc_client_rpc_t *), void *priv); +int sfw_create_test_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t peer, unsigned features, + int nblk, int blklen, srpc_client_rpc_t **rpc); +void sfw_abort_rpc(srpc_client_rpc_t *rpc); +void sfw_post_rpc(srpc_client_rpc_t *rpc); +void sfw_client_rpc_done(srpc_client_rpc_t *rpc); +void sfw_unpack_message(srpc_msg_t *msg); +void sfw_free_pages(srpc_server_rpc_t *rpc); +void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); +int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); + +srpc_client_rpc_t * +srpc_create_client_rpc(lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv); +void srpc_post_rpc(srpc_client_rpc_t *rpc); +void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); +void srpc_free_bulk(srpc_bulk_t *bk); +srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, + int sink); +int srpc_send_rpc(swi_workitem_t *wi); +int srpc_send_reply(srpc_server_rpc_t *rpc); +int srpc_add_service(srpc_service_t *sv); +int srpc_remove_service(srpc_service_t *sv); +void srpc_shutdown_service(srpc_service_t *sv); +void srpc_abort_service(srpc_service_t *sv); +int srpc_finish_service(srpc_service_t *sv); +int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); +void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); +void srpc_get_counters(srpc_counters_t *cnt); +void srpc_set_counters(const srpc_counters_t *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static inline int +swi_wi_action(cfs_workitem_t *wi) +{ + swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + + return swi->swi_action(swi); +} + +static inline 
void +swi_init_workitem(swi_workitem_t *swi, void *data, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); +} + +static inline void +swi_schedule_workitem(swi_workitem_t *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(swi_workitem_t *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(swi_workitem_t *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } + + return; +} + +static inline void +srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + LASSERT (nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateHandle(&rpc->crpc_reqstmdh); + LNetInvalidateHandle(&rpc->crpc_replymdh); + LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); + return; +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define selftest_wait_events() \ + do { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + schedule_timeout(cfs_time_seconds(1) / 10); \ + } while (0) + + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + selftest_wait_events(); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(srpc_service_t *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG (((i & -i) == i) ? 
D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + selftest_wait_events(); + } +} + +extern sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void); + +extern srpc_service_t brw_test_service; +void brw_init_test_service(void); + +extern sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void); + +extern srpc_service_t ping_test_service; +void ping_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.c b/kernel/drivers/staging/lustre/lnet/selftest/timer.c new file mode 100644 index 000000000..441f9472a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
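 *
 * Worked example, using the STTIMER_* constants defined just below: a
 * timer expiring at t = 1000s hashes to slot (1000 >> 3) & 127 = 125,
 * one expiring at t = 1003s lands in the same slot, and t = 1008s goes
 * to slot 126.  This slotting is also why the timer thread only needs to
 * wake up once every STTIMER_SLOTTIME (8) seconds; see stt_timer_main()
 * below.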
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + unsigned long stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(stt_timer_t *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(cfs_time_after(timer->stt_expires, get_seconds())); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + + if (cfs_time_aftereq(timer->stt_expires, old->stt_expires)) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(stt_timer_t *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, unsigned long now) +{ + int expired = 0; + stt_timer_t *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, stt_timer_t, stt_list); + + if (cfs_time_after(timer->stt_expires, now)) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(unsigned long *last) +{ + int expired = 0; + unsigned long now; + unsigned long this_slot; + + now = get_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (cfs_time_aftereq(this_slot, *last)) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main(void *arg) +{ + cfs_block_allsigs(); + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return 0; +} + +static int +stt_start_timer_thread(void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int 
+stt_startup(void) +{ + int rc = 0; + int i; + + stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = get_seconds() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.h b/kernel/drivers/staging/lustre/lnet/selftest/timer.h new file mode 100644 index 000000000..d727c1e2b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +typedef struct { + struct list_head stt_list; + unsigned long stt_expires; + void (*stt_func) (void *); + void *stt_data; +} stt_timer_t; + +void stt_add_timer (stt_timer_t *timer); +int stt_del_timer (stt_timer_t *timer); +int stt_startup (void); +void stt_shutdown (void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/Kconfig b/kernel/drivers/staging/lustre/lustre/Kconfig new file mode 100644 index 000000000..62c7bba75 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/Kconfig @@ -0,0 +1,62 @@ +config LUSTRE_FS + tristate "Lustre file system client support" + depends on INET && m && !MIPS && !XTENSA && !SUPERH + select LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. 
To compile + this file system support as a module, choose M here: the module will + be called lustre. + + To mount Lustre file systems, you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package, available from + http://downloads.whamcloud.com/public/lustre/ + + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_OBD_MAX_IOCTL_BUFFER + int "Lustre obd max ioctl buffer bytes (default 8KB)" + depends on LUSTRE_FS + default 8192 + help + This option defines the maximum size of buffer in bytes that user space + applications can pass to Lustre kernel module through ioctl interface. + + If unsure, use default. + +config LUSTRE_DEBUG_EXPENSIVE_CHECK + bool "Enable Lustre DEBUG checks" + depends on LUSTRE_FS + help + This option is mainly for debug purpose. It enables Lustre code to do + expensive checks that may have a performance impact. + + Use with caution. If unsure, say N. + +config LUSTRE_TRANSLATE_ERRNOS + bool + depends on LUSTRE_FS && !X86 + default y + +config LUSTRE_LLITE_LLOOP + tristate "Lustre virtual block device" + depends on LUSTRE_FS && BLOCK + depends on !PPC_64K_PAGES && !ARM64_64K_PAGES && !MICROBLAZE_64K_PAGES && !PAGE_SIZE_64KB && !IA64_PAGE_SIZE_64KB && !PARISC_PAGE_SIZE_64KB + default m diff --git a/kernel/drivers/staging/lustre/lustre/Makefile b/kernel/drivers/staging/lustre/lustre/Makefile new file mode 100644 index 000000000..35d8b0b2d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LUSTRE_FS) += libcfs/ obdclass/ ptlrpc/ fld/ osc/ mgc/ \ + fid/ lov/ mdc/ lmv/ llite/ obdecho/ diff --git a/kernel/drivers/staging/lustre/lustre/fid/Makefile b/kernel/drivers/staging/lustre/lustre/fid/Makefile new file mode 100644 index 000000000..5513ce416 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += fid.o +fid-y := fid_request.o fid_lib.o +fid-$(CONFIG_PROC_FS) += lproc_fid.o diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h b/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h new file mode 100644 index 000000000..b5e8da895 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include "../include/lustre/lustre_idl.h" +#include "../../include/linux/libcfs/libcfs.h" + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +#if defined(CONFIG_PROC_FS) +extern struct lprocfs_vars seq_client_proc_list[]; +#endif + +extern struct proc_dir_entry *seq_type_proc_dir; + +#endif /* __FID_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c b/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c new file mode 100644 index 000000000..dd65159eb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_fid.h" + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objid:48        0:32
+ * 
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + FID_SEQ_NORMAL, + (__u64)~0ULL +}; +EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE); + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + 0, + 0 +}; +EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE); + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_request.c b/kernel/drivers/staging/lustre/lustre/fid/fid_request.c new file mode 100644 index 000000000..063441abf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_request.c @@ -0,0 +1,572 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
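To make the namespace table above concrete, the sketch below spells out the three-field FID triple with a stand-alone mirror struct (not the real struct lu_fid) and prints it in the familiar [seq:oid:ver] style; the lower bound used for the example sequence follows the table and the 0x400 reservation noted above.

#include <stdint.h>
#include <stdio.h>

/* stand-alone mirror of the FID triple described in the table above */
struct fid_example {
	uint64_t f_seq;   /* sequence: selects the range the object belongs to */
	uint32_t f_oid;   /* object id within that sequence */
	uint32_t f_ver;   /* version; 0 for every FID defined in this file */
};

int main(void)
{
	/* a "normal" FID: first sequence past the reserved 0x400 block above 2^33 */
	struct fid_example fid = { .f_seq = (1ULL << 33) + 0x400, .f_oid = 1, .f_ver = 0 };

	printf("[%#jx:0x%x:0x%x]\n", (uintmax_t)fid.f_seq, fid.f_oid, fid.f_ver);
	return 0;
}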
+ * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +/* mdc RPC locks */ +#include "../include/lustre_mdc.h" +#include "fid_internal.h" + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (req == NULL) + return -ENOMEM; + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) + req->rq_no_delay = req->rq_no_resend = 1; + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + req->rq_request_portal = SEQ_METADATA_PORTAL; + else + req->rq_request_portal = SEQ_DATA_PORTAL; + debug_mask = D_INFO; + } + + ptlrpc_at_set_req_timeout(req); + + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + goto out_req; + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + rc = -EINVAL; + goto out_req; + } + + if (range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + rc = -EINVAL; + goto out_req; + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { + rc = 0; + } else { + /* Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) */ + if (seq->lcs_exp == NULL) { + mutex_unlock(&seq->lcs_mutex); + return -EINPROGRESS; + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + return rc; +} + +/* Request sequence-controller node to allocate new meta-sequence. 
*/ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + + if (seq->lcs_srv) { + rc = 0; + } else { + do { + /* If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + return rc; +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + + LASSERT(range_is_sane(&seq->lcs_space)); + + if (range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + CERROR("%s: Can't allocate new meta-sequence, rc %d\n", + seq->lcs_name, rc); + return rc; + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + return rc; +} + +static int seq_fid_alloc_prep(struct lu_client_seq *seq, + wait_queue_t *link) +{ + if (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_RUNNING); + return -EAGAIN; + } + ++seq->lcs_update; + mutex_unlock(&seq->lcs_mutex); + return 0; +} + +static void seq_fid_alloc_fini(struct lu_client_seq *seq) +{ + LASSERT(seq->lcs_update == 1); + mutex_lock(&seq->lcs_mutex); + --seq->lcs_update; + wake_up(&seq->lcs_waitq); +} + +/** + * Allocate the whole seq to the caller. + **/ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + wait_queue_t link; + int rc; + + LASSERT(seqnr != NULL); + mutex_lock(&seq->lcs_mutex); + init_waitqueue_entry(&link, current); + + while (1) { + rc = seq_fid_alloc_prep(seq, &link); + if (rc == 0) + break; + } + + rc = seq_client_alloc_seq(env, seq, seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, rc %d\n", + seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + return rc; + } + + CDEBUG(D_INFO, "%s: allocate sequence [0x%16.16Lx]\n", + seq->lcs_name, *seqnr); + + /* Since the caller require the whole seq, + * so marked this seq to be used */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + + seq->lcs_fid.f_seq = *seqnr; + seq->lcs_fid.f_ver = 0; + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/* Allocate new fid on passed client @seq and save it to @fid. 
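Stripped of the RPC and locking details, the allocation path above hands out lsr_start from the locally granted range and bumps it, refilling the range from the sequence controller only on exhaustion. A stand-alone model of that bookkeeping, with a simplified range type in place of lu_seq_range and a stub (with an arbitrary grant size) in place of seq_client_rpc():

#include <stdint.h>
#include <stdio.h>

struct range_example {       /* simplified stand-in for lu_seq_range */
	uint64_t start;      /* next unallocated sequence */
	uint64_t end;        /* first sequence past the granted range */
};

static int range_exhausted(const struct range_example *r)
{
	return r->start >= r->end;
}

/* stub for the "ask the server for a new range" RPC; 128 is arbitrary here */
static int refill_from_server(struct range_example *r)
{
	static uint64_t next = (1ULL << 33) + 0x400;  /* pretend server cursor */

	r->start = next;
	r->end = next + 128;
	next = r->end;
	return 0;
}

static int alloc_seq(struct range_example *space, uint64_t *seqnr)
{
	if (range_exhausted(space) && refill_from_server(space) != 0)
		return -1;

	*seqnr = space->start++;   /* same "take lsr_start, bump it" step as above */
	return 0;
}

int main(void)
{
	struct range_example space = { 0, 0 };
	uint64_t seq;
	int i;

	for (i = 0; i < 3; i++) {
		if (alloc_seq(&space, &seq) == 0)
			printf("allocated sequence %#jx\n", (uintmax_t)seq);
	}
	return 0;
}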
*/ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + wait_queue_t link; + int rc; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + while (1) { + u64 seqnr; + + if (!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid += 1; + rc = 0; + break; + } + + rc = seq_fid_alloc_prep(seq, &link); + if (rc) + continue; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, rc %d\n", + seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + return rc; + } + + CDEBUG(D_INFO, "%s: Switch to sequence [0x%16.16Lx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + seq->lcs_fid.f_seq = seqnr; + seq->lcs_fid.f_ver = 0; + + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + rc = 1; + + seq_fid_alloc_fini(seq); + break; + } + + *fid = seq->lcs_fid; + mutex_unlock(&seq->lcs_mutex); + + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid)); + return rc; +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + wait_queue_t link; + + LASSERT(seq != NULL); + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + while (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_RUNNING); + } + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. 
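The loop in seq_client_alloc_fid() above reduces to: while the current sequence still has room (f_oid below lcs_width), a new FID is simply the last one with f_oid bumped; only when the width is used up does the client switch to a fresh sequence and return 1 so the caller can set up the FLD for it. A compact stand-alone model of that policy (simplified types, no waitqueue or mutex handling, tiny width so the switch is visible):

#include <stdint.h>
#include <stdio.h>

struct fid_model { uint64_t seq; uint32_t oid; };   /* stand-in for lu_fid */

struct seq_model {
	struct fid_model fid;      /* last FID handed out */
	uint32_t width;            /* stand-in for lcs_width */
	uint64_t next_seq;         /* stand-in for the granted sequence range */
};

/* returns 0 when only the oid was bumped, 1 when a sequence switch happened */
static int alloc_fid(struct seq_model *s, struct fid_model *out)
{
	if (s->fid.seq != 0 && s->fid.oid < s->width) {
		s->fid.oid++;              /* common case: bump the object id */
		*out = s->fid;
		return 0;
	}

	s->fid.seq = s->next_seq++;        /* stand-in for seq_client_alloc_seq() */
	s->fid.oid = 1;                    /* first usable oid in the new sequence */
	*out = s->fid;
	return 1;                          /* caller must update the FLD */
}

int main(void)
{
	struct seq_model s = { .fid = { 0, 0 }, .width = 3, .next_seq = 0x200000400ULL };
	struct fid_model fid;
	int i;

	for (i = 0; i < 5; i++) {
		int switched = alloc_fid(&s, &fid);

		printf("fid [%#jx:0x%x] switched=%d\n",
		       (uintmax_t)fid.seq, fid.oid, switched);
	}
	return 0;
}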
+ */ + + seq->lcs_space.lsr_index = -1; + + range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_proc_fini(struct lu_client_seq *seq) +{ +#if defined(CONFIG_PROC_FS) + if (seq->lcs_proc_dir) { + if (!IS_ERR(seq->lcs_proc_dir)) + lprocfs_remove(&seq->lcs_proc_dir); + seq->lcs_proc_dir = NULL; + } +#endif /* CONFIG_PROC_FS */ +} + +static int seq_client_proc_init(struct lu_client_seq *seq) +{ +#if defined(CONFIG_PROC_FS) + int rc; + + seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, + seq_type_proc_dir, + NULL, NULL); + + if (IS_ERR(seq->lcs_proc_dir)) { + CERROR("%s: LProcFS failed in seq-init\n", + seq->lcs_name); + rc = PTR_ERR(seq->lcs_proc_dir); + return rc; + } + + rc = lprocfs_add_vars(seq->lcs_proc_dir, + seq_client_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager proc, rc %d\n", + seq->lcs_name, rc); + goto out_cleanup; + } + + return 0; + +out_cleanup: + seq_client_proc_fini(seq); + return rc; + +#else /* CONFIG_PROC_FS */ + return 0; +#endif +} + +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + int rc; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + init_waitqueue_head(&seq->lcs_waitq); + /* Make sure that things are clear before work is started. */ + seq_client_flush(seq); + + if (exp != NULL) + seq->lcs_exp = class_export_get(exp); + else if (type == LUSTRE_SEQ_METADATA) + LASSERT(seq->lcs_srv != NULL); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + rc = seq_client_proc_init(seq); + if (rc) + seq_client_fini(seq); + return rc; +} +EXPORT_SYMBOL(seq_client_init); + +void seq_client_fini(struct lu_client_seq *seq) +{ + seq_client_proc_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; +} +EXPORT_SYMBOL(seq_client_fini); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc; + + OBD_ALLOC_PTR(cli->cl_seq); + if (cli->cl_seq == NULL) + return -ENOMEM; + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) { + rc = -ENOMEM; + goto out_free_seq; + } + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + goto out_free_seq; + + return rc; +out_free_seq: + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + return rc; +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + if (cli->cl_seq != NULL) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + + return 0; +} +EXPORT_SYMBOL(client_fid_fini); + +struct proc_dir_entry *seq_type_proc_dir; + +static int __init fid_mod_init(void) +{ + seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, + proc_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(seq_type_proc_dir); +} + +static void __exit fid_mod_exit(void) +{ + if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { + lprocfs_remove(&seq_type_proc_dir); + 
seq_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FID Module"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1.0"); + +module_init(fid_mod_init); +module_exit(fid_mod_exit); diff --git a/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c b/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c new file mode 100644 index 000000000..783939dbd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/dt_object.h" +#include "../include/obd_support.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fid.h" +#include "fid_internal.h" + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/* + * Note: this function is only used for testing, it is no safe for production + * use. 
+ */ +static int lprocfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + struct lu_seq_range tmp; + int rc; + char kernbuf[MAX_FID_RANGE_STRLEN]; + + LASSERT(range != NULL); + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + return count; + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (!range_is_sane(&tmp) || range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + return -EINVAL; + *range = tmp; + return count; +} + +/* Client side procfs stuff */ +static ssize_t lprocfs_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq; + int rc; + + seq = ((struct seq_file *)file->private_data)->private; + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space); + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + return count; +} + +static int +lprocfs_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static ssize_t lprocfs_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq; + __u64 max; + int rc, val; + + seq = ((struct seq_file *)file->private_data)->private; + LASSERT(seq != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + mutex_lock(&seq->lcs_mutex); + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max && val > 0) { + seq->lcs_width = val; + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", + seq->lcs_name, seq->lcs_width); + } + } + + mutex_unlock(&seq->lcs_mutex); + + return count; +} + +static int +lprocfs_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static int +lprocfs_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID "\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static int +lprocfs_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + + LASSERT(seq != NULL); + + if (seq->lcs_exp != NULL) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); + } + + return 0; +} + +LPROC_SEQ_FOPS(lprocfs_fid_space); +LPROC_SEQ_FOPS(lprocfs_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_fid_server); 
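The write helper above accepts either the literal string "clear" or a range written as "[0xSTART - 0xEND]". A small user-space approximation of the same parse, handy for checking what the sscanf format tolerates (it skips the kernel-side copy_from_user and range_is_sane checks):

#include <stdio.h>
#include <string.h>

/* mimic lprocfs_fid_write_common(): "clear" or "[0x... - 0x...]" */
static int parse_range(const char *buf, unsigned long long *start,
		       unsigned long long *end)
{
	if (strcmp(buf, "clear") == 0) {
		*start = 0;
		*end = 0;
		return 0;
	}
	if (sscanf(buf, "[%llx - %llx]", start, end) != 2)
		return -1;
	return (*start < *end) ? 0 : -1;   /* a sane range has start < end */
}

int main(void)
{
	unsigned long long s, e;

	if (parse_range("[0x0000000240000400 - 0x0000000280000400]", &s, &e) == 0)
		printf("parsed range [%#llx - %#llx]\n", s, e);
	return 0;
}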
+LPROC_SEQ_FOPS_RO(lprocfs_fid_fid); + +struct lprocfs_vars seq_client_proc_list[] = { + { "space", &lprocfs_fid_space_fops }, + { "width", &lprocfs_fid_width_fops }, + { "server", &lprocfs_fid_server_fops }, + { "fid", &lprocfs_fid_fid_fops }, + { NULL } +}; diff --git a/kernel/drivers/staging/lustre/lustre/fld/Makefile b/kernel/drivers/staging/lustre/lustre/fld/Makefile new file mode 100644 index 000000000..2bbf08433 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += fld.o +fld-y := fld_request.o fld_cache.o +fld-$(CONFIG_PROC_FS) += lproc_fld.o diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c b/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c new file mode 100644 index 000000000..0d0a73745 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_ver.h" +#include "../include/obd_support.h" +#include "../include/lprocfs_status.h" + +#include "../include/dt_object.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold) +{ + struct fld_cache *cache; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, + sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. 
*/ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + return cache; +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + __u64 pct; + + LASSERT(cache != NULL); + fld_cache_flush(cache); + + if (cache->fci_stat.fst_count > 0) { + pct = cache->fci_stat.fst_cache * 100; + do_div(pct, cache->fci_stat.fst_count); + } else { + pct = 0; + } + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); + + OBD_FREE_PTR(cache); +} + +/** + * delete given node from list. + */ +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. + */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. 
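fld_fix_new_list() above keeps the entries sorted by lsr_start and repeatedly reconciles neighbours: touching ranges on the same index are coalesced, overlapping ranges are merged or trimmed, and exact duplicates are dropped. The core rule, two touching ranges on the same index collapsing into one, looks like this in isolation (simplified type, single pair, no list or LRU bookkeeping):

#include <stdint.h>
#include <stdio.h>

struct range_ex { uint64_t start, end; uint32_t index; };

/* merge b into a when they touch and live on the same index; returns 1 on merge */
static int try_merge(struct range_ex *a, const struct range_ex *b)
{
	if (a->index != b->index || a->end != b->start)
		return 0;
	a->end = b->end;     /* [a.start, a.end) now covers both ranges */
	return 1;
}

int main(void)
{
	struct range_ex cur = { 0x400, 0x500, 0 };
	struct range_ex next = { 0x500, 0x600, 0 };

	if (try_merge(&cur, &next))
		printf("merged into [%#jx, %#jx) on index %u\n",
		       (uintmax_t)cur.start, (uintmax_t)cur.end, cur.index);
	return 0;
}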
+ */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + struct fld_cache_entry *flde; + struct list_head *curr; + int num = 0; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + return 0; + + curr = cache->fci_lru.prev; + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && curr != &cache->fci_lru) { + + flde = list_entry(curr, struct fld_cache_entry, fce_lru); + curr = curr->prev; + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n", + cache->fci_name, num); + + return 0; +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. + */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + OBD_ALLOC_GFP(fldt, sizeof(*fldt), GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + /* overlap is not allowed, so dont mess up list. */ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* f_new = *range */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlapping with next range. */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. 
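fld_cache_punch_hole() above handles a new range landing strictly inside a cached one by splitting the old entry in three: a left remainder that keeps the old index, the new middle entry, and a freshly allocated right remainder that also keeps the old index. The arithmetic of that split, in isolation (simplified type, no allocation failure path):

#include <stdint.h>
#include <stdio.h>

struct range_ex2 { uint64_t start, end; uint32_t index; };

/*
 * Split "old" around "hole" (assumed to lie strictly inside it):
 * left keeps old's index up to hole.start, right covers hole.end .. old.end.
 */
static void punch_hole(const struct range_ex2 *old, const struct range_ex2 *hole,
		       struct range_ex2 *left, struct range_ex2 *right)
{
	*left  = (struct range_ex2){ old->start, hole->start, old->index };
	*right = (struct range_ex2){ hole->end,  old->end,    old->index };
}

int main(void)
{
	struct range_ex2 old  = { 0x100, 0x900, 7 };
	struct range_ex2 hole = { 0x300, 0x500, 9 };   /* newly learned range */
	struct range_ex2 left, right;

	punch_hole(&old, &hole, &left, &right);
	printf("left  [%#jx, %#jx) idx %u\n", (uintmax_t)left.start,
	       (uintmax_t)left.end, left.index);
	printf("hole  [%#jx, %#jx) idx %u\n", (uintmax_t)hole.start,
	       (uintmax_t)hole.end, hole.index);
	printf("right [%#jx, %#jx) idx %u\n", (uintmax_t)right.start,
	       (uintmax_t)right.end, right.index);
	return 0;
}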
*/ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range), PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + return ERR_PTR(-ENOMEM); + + f_new->fce_range = *range; + return f_new; +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. + */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. + */ + + if (!cache->fci_no_shrink) + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + return 0; +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + return PTR_ERR(flde); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + return rc; +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * Delete FLD entry in FLD cache. 
+ * + */ +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + write_lock(&cache->fci_lock); + fld_cache_delete_nolock(cache, range); + write_unlock(&cache->fci_lock); +} + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *got = NULL; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry(flde, head, fce_list) { + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + got = flde; + break; + } + } + + return got; +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +struct fld_cache_entry +*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range) +{ + struct fld_cache_entry *got = NULL; + + read_lock(&cache->fci_lock); + got = fld_cache_entry_lookup_nolock(cache, range); + read_unlock(&cache->fci_lock); + return got; +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + return 0; + } + } + read_unlock(&cache->fci_lock); + return -ENOENT; +} diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h b/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h new file mode 100644 index 000000000..68bec7658 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h @@ -0,0 +1,193 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
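fld_cache_lookup() above walks the start-sorted entry list and reports a hit as soon as the sequence falls inside a cached range; once it reaches an entry whose lsr_start exceeds the sequence it can stop, and a miss sends the caller to the server. The same scan over a start-sorted array, as a stand-alone sketch:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t start, end; uint32_t index; };

/* linear scan of a start-sorted table; mirrors the list walk above */
static int lookup(const struct rng *tbl, int n, uint64_t seq, uint32_t *index)
{
	int i;

	for (i = 0; i < n; i++) {
		if (tbl[i].start > seq)
			break;                  /* sorted: no later entry can match */
		if (seq < tbl[i].end) {
			*index = tbl[i].index;  /* cache hit */
			return 0;
		}
	}
	return -ENOENT;                         /* miss: caller falls back to an RPC */
}

int main(void)
{
	const struct rng cache[] = {
		{ 0x200000400, 0x200000800, 0 },
		{ 0x200000800, 0x200000c00, 1 },
	};
	uint32_t idx;

	if (lookup(cache, 2, 0x200000a00, &idx) == 0)
		printf("seq 0x200000a00 lives on index %u\n", idx);
	return 0;
}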
+ * + * lustre/fld/fld_internal.h + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include "../include/lustre/lustre_idl.h" +#include "../include/dt_object.h" + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" + +enum { + LUSTRE_FLD_INIT = 1 << 0, + LUSTRE_FLD_RUN = 1 << 1 +}; + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; + __u64 fst_inflight; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Preferred number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[LUSTRE_MDT_MAXNAMELEN]; + unsigned int fci_no_shrink:1; +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2 +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. 
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op); + +#if defined(CONFIG_PROC_FS) +extern struct lprocfs_vars fld_client_proc_list[]; +#endif + + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +struct fld_cache_entry* +fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range); +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); +void fld_dump_cache_entries(struct fld_cache *cache); + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range); +int fld_write_range(const struct lu_env *env, struct dt_object *dt, + const struct lu_seq_range *range, struct thandle *th); + +static inline const char * +fld_target_name(struct lu_fld_target *tar) +{ + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; + + return (const char *)tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_request.c b/kernel/drivers/staging/lustre/lustre/fld/fld_request.c new file mode 100644 index 000000000..6ac225e90 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_request.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_ver.h" +#include "../include/obd_support.h" +#include "../include/lprocfs_status.h" + +#include "../include/dt_object.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "../include/lustre_mdc.h" +#include "fld_internal.h" + +/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c + * It should be common thing. The same about mdc RPC lock */ +static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +}; + +static void fld_enter_request(struct client_obd *cli) +{ + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = { 0 }; + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi); + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } +} + +static void fld_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + int hash; + + /* Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + +again: + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + return target; + } + + if (hash != 0) { + /* It is possible the remote target(MDT) are not connected to + * with client yet, so we will refer this to MDT0, which should + * be connected during mount */ + hash = 0; + goto again; + } + + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? 
+ (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + return NULL; +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + NULL, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target != NULL) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } + + return target; +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. + */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + if (fld->lcf_flags != LUSTRE_FLD_INIT) { + CERROR("%s: Attempt to add target %s (idx %llu) on fly - skip it\n", + fld->lcf_name, name, tar->ft_idx); + return 0; + } + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", + fld->lcf_name, name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (target == NULL) + return -ENOMEM; + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + return -EEXIST; + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp != NULL) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, + &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + return 0; +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +{ + struct lu_fld_target *target, *tmp; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + return 0; + } + } + spin_unlock(&fld->lcf_lock); + return -ENOENT; +} +EXPORT_SYMBOL(fld_client_del_target); + +static struct proc_dir_entry *fld_type_proc_dir; + +#if defined(CONFIG_PROC_FS) +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + int rc; + + fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, + fld_type_proc_dir, + NULL, NULL); + + if (IS_ERR(fld->lcf_proc_dir)) { + CERROR("%s: LProcFS failed in fld-init\n", + fld->lcf_name); + rc = PTR_ERR(fld->lcf_proc_dir); + return rc; + } + + rc = lprocfs_add_vars(fld->lcf_proc_dir, + fld_client_proc_list, fld); + if (rc) { + CERROR("%s: Can't init FLD proc, rc %d\n", + fld->lcf_name, rc); + goto out_cleanup; + } + + return 0; + +out_cleanup: + fld_client_proc_fini(fld); + return rc; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + 
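The round-robin (RRB) policy above is plain modulo arithmetic: a normal sequence is mapped to one of lcf_count targets as seq % lcf_count, while reserved and special sequences are pinned to index 0 (MDT0), which is also the fallback when the hashed target is not connected yet. A stand-alone version of the selection rule; the lower bound used for a "normal" sequence stands in for fid_seq_is_norm() and follows the 0x400 reservation comment in fid_lib.c earlier in this patch:

#include <stdint.h>
#include <stdio.h>

/* stand-in for fid_seq_is_norm(): normal sequences start past the
 * reserved 0x400 block above 2^33 */
static int seq_is_normal(uint64_t seq)
{
	return seq >= ((1ULL << 33) + 0x400);
}

/* mirrors fld_rrb_hash()/fld_rrb_scan(): special sequences always go to MDT0 */
static unsigned int rrb_target(uint64_t seq, unsigned int target_count)
{
	if (!seq_is_normal(seq) || target_count == 0)
		return 0;
	return (unsigned int)(seq % target_count);
}

int main(void)
{
	printf("seq 0x200000401 -> MDT%u (of 4)\n", rrb_target(0x200000401ULL, 4));
	printf("seq 0x2         -> MDT%u (of 4)\n", rrb_target(0x2ULL, 4));
	return 0;
}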
if (fld->lcf_proc_dir) { + if (!IS_ERR(fld->lcf_proc_dir)) + lprocfs_remove(&fld->lcf_proc_dir); + fld->lcf_proc_dir = NULL; + } +} +#else +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + return 0; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ +} +#endif +EXPORT_SYMBOL(fld_client_proc_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc; + + LASSERT(fld != NULL); + + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + return -EINVAL; + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + fld->lcf_flags = LUSTRE_FLD_INIT; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + goto out; + } + + rc = fld_client_proc_init(fld); + if (rc) + goto out; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if (fld->lcf_cache != NULL) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op) +{ + struct ptlrpc_request *req; + struct lu_seq_range *prange; + __u32 *op; + int rc; + struct obd_import *imp; + + LASSERT(exp != NULL); + + imp = class_exp2cliimp(exp); + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, + FLD_QUERY); + if (req == NULL) + return -ENOMEM; + + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = fld_op; + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (fld_op == FLD_LOOKUP && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) + req->rq_allow_replay = 1; + + if (fld_op != FLD_LOOKUP) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + fld_enter_request(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + fld_exit_request(&exp->exp_obd->u.cli); + if (fld_op != FLD_LOOKUP) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + goto out_req; + + prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); + if (prange == NULL) { + rc = -EFAULT; + goto out_req; + } + *range = *prange; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + 
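fld_client_init() above sizes the client cache by dividing the 1 MiB FLD_CLIENT_CACHE_SIZE budget by the size of one cache entry and then taking 10% of that as the shrink threshold. The same arithmetic, using stand-in structs whose layout only approximates the kernel ones, so the printed numbers are indicative rather than exact:

#include <stdint.h>
#include <stdio.h>

/* stand-ins, sized roughly like the kernel structs (two list heads + a range) */
struct list_head_ex { void *next, *prev; };
struct seq_range_ex { uint64_t start, end; uint32_t index, flags; };

struct cache_entry_ex {
	struct list_head_ex lru, list;
	struct seq_range_ex range;
};

int main(void)
{
	const unsigned long budget = 1 * 0x100000;   /* FLD_CLIENT_CACHE_SIZE: 1 MiB */
	unsigned long cache_size = budget / sizeof(struct cache_entry_ex);
	unsigned long threshold = cache_size * 10 / 100;   /* 10% shrink threshold */

	printf("entry size %zu -> %lu cached entries, shrink threshold %lu\n",
	       sizeof(struct cache_entry_ex), cache_size, threshold);
	return 0;
}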
struct lu_fld_target *target; + int rc; + + fld->lcf_flags |= LUSTRE_FLD_RUN; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + return 0; + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP); + + if (rc == 0) { + *mds = res.lsr_index; + + fld_cache_insert(fld->lcf_cache, &res); + } + return rc; +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} +EXPORT_SYMBOL(fld_client_flush); + +static int __init fld_mod_init(void) +{ + fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, + proc_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(fld_type_proc_dir); +} + +static void __exit fld_mod_exit(void) +{ + if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { + lprocfs_remove(&fld_type_proc_dir); + fld_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FLD"); +MODULE_LICENSE("GPL"); + +module_init(fld_mod_init) +module_exit(fld_mod_exit) diff --git a/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c b/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c new file mode 100644 index 000000000..f53fdcfae --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c @@ -0,0 +1,172 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/dt_object.h" +#include "../include/obd_support.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "../include/lustre_fid.h" +#include "fld_internal.h" + +static int +fld_proc_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, + &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + + return 0; +} + +static int +fld_proc_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + return 0; +} + +static ssize_t +fld_proc_hash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_fld *fld; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + fld = ((struct seq_file *)file->private_data)->private; + LASSERT(fld != NULL); + + for (i = 0; fld_hash[i].fh_name != NULL; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash != NULL) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t +fld_proc_cache_flush_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) +{ + struct lu_client_fld *fld = file->private_data; + + LASSERT(fld != NULL); + + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); + + return count; +} + +static int fld_proc_cache_flush_open(struct inode *inode, struct file *file) +{ + file->private_data = PDE_DATA(inode); + return 0; +} + +static int fld_proc_cache_flush_release(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static struct file_operations fld_proc_cache_flush_fops = { + .owner = THIS_MODULE, + .open = fld_proc_cache_flush_open, + .write = fld_proc_cache_flush_write, + .release = fld_proc_cache_flush_release, +}; + +LPROC_SEQ_FOPS_RO(fld_proc_targets); +LPROC_SEQ_FOPS(fld_proc_hash); + +struct lprocfs_vars fld_client_proc_list[] = { + { "targets", &fld_proc_targets_fops }, + { "hash", &fld_proc_hash_fops }, + { "cache_flush", &fld_proc_cache_flush_fops }, + { NULL } +}; diff --git a/kernel/drivers/staging/lustre/lustre/include/cl_object.h b/kernel/drivers/staging/lustre/lustre/include/cl_object.h new file mode 100644 index 000000000..d56c8bea8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/cl_object.h @@ -0,0 +1,3287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * - cl_req represents a collection of pages for a transfer. cl_req is + * constructed by req-forming engine that tries to saturate + * transport with large and continuous transfers. + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - cl_object_header::coh_lock_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. 
+ */ +#include "lu_object.h" +#include "linux/lustre_compat25.h" +#include +#include + +struct inode; + +struct cl_device; +struct cl_device_operations; + +struct cl_object; +struct cl_object_page_operations; +struct cl_object_lock_operations; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req; +struct cl_req_slice; + +/** + * Operations for each data device in the client stack. + * + * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops + */ +struct cl_device_operations { + /** + * Initialize cl_req. This method is called top-to-bottom on all + * devices in the stack to get them a chance to allocate layer-private + * data, and to attach them to the cl_req by calling + * cl_req_slice_add(). + * + * \see osc_req_init(), lov_req_init(), lovsub_req_init() + * \see ccc_req_init() + */ + int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +}; + +/** + * Device in the client stack. + * + * \see ccc_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; + /** Per-layer operation vector. */ + const struct cl_device_operations *cd_ops; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. 
For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see ccc_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lustre_md *coc_md; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. 
+ */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see ccc_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + /** \name locks + * \todo XXX move locks below to the separate cache-lines, they are + * mostly useless otherwise. + */ + /** @{ */ + /** Lock protecting page tree. */ + spinlock_t coh_page_guard; + /** Lock protecting lock list. */ + spinlock_t coh_lock_guard; + /** @} locks */ + /** Radix tree of cl_page's, cached for this object. */ + struct radix_tree_root coh_tree; + /** # of pages in radix tree. */ + unsigned long coh_pages; + /** List of cl_lock's granted for this object. */ + struct list_head coh_locks; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. 
+ */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** @} cl_object */ + +#ifndef pgoff_t +#define pgoff_t unsigned long +#endif + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. 
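The reference-counting rule described above (a page whose last reference is dropped goes back to the cache, unless it is already in cl_page_state::CPS_FREEING, in which case it is destroyed at once) can be illustrated with a small, self-contained userspace sketch. The toy_page type and the toy_get()/toy_put() helpers below are hypothetical stand-ins for cl_page::cp_ref and cl_page::cp_state; they are not the actual cl_page_get()/cl_page_put() code and they ignore the VM-page-lock protection the real implementation relies on.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum toy_page_state { TOY_CACHED, TOY_OWNED, TOY_FREEING };

struct toy_page {
	atomic_int ref;              /* stands in for cl_page::cp_ref */
	enum toy_page_state state;   /* stands in for cl_page::cp_state */
};

static bool toy_get(struct toy_page *pg)
{
	/* No new references once the page has entered the FREEING state. */
	if (pg->state == TOY_FREEING)
		return false;
	atomic_fetch_add(&pg->ref, 1);
	return true;
}

static void toy_put(struct toy_page *pg)
{
	/* atomic_fetch_sub() returns the previous value: 1 means last ref. */
	if (atomic_fetch_sub(&pg->ref, 1) != 1)
		return;
	if (pg->state == TOY_FREEING) {
		free(pg);            /* last reference of a doomed page */
		return;
	}
	pg->state = TOY_CACHED;      /* back to the cache, un-owned */
}

int main(void)
{
	struct toy_page *pg = malloc(sizeof(*pg));

	atomic_init(&pg->ref, 1);
	pg->state = TOY_OWNED;
	toy_put(pg);                 /* last put: OWNED -> CACHED */
	printf("state after last put: %d (0 == TOY_CACHED)\n", pg->state);

	pg->state = TOY_FREEING;     /* doomed page: further gets must fail */
	printf("get on FREEING page: %d\n", toy_get(pg));
	free(pg);
	return 0;
}

In the real code the state field is changed only under the VM page lock and the cache itself keeps the page alive; the sketch compresses all of that into a single thread purely to show the put-time decision.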
+ * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an cl_req being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. 
Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read cl_req from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO, lockless IO and liblustre. */ + CPT_TRANSIENT, +}; + +/** + * Flags maintained for every cl_page. + */ +enum cl_page_flags { + /** + * Set when pagein completes. Used for debugging (read completes at + * most once for a page). 
+ */ + CPF_READ_COMPLETED = 1 << 0 +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** Logical page index within the object. Immutable after creation. */ + pgoff_t cp_index; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** Parent page, NULL for top-level page. Immutable after creation. */ + struct cl_page *cp_parent; + /** Lower-layer page. NULL for bottommost page. Immutable after + * creation. */ + struct cl_page *cp_child; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** Linkage of pages within group. Protected by cl_page::cp_mutex. */ + struct list_head cp_batch; + /** Mutex serializing membership of a page in a batch. */ + struct mutex cp_mutex; + /** Linkage of pages within cl_req. */ + struct list_head cp_flight; + /** Transfer error. */ + int cp_error; + + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** + * Debug information, the task is owning the page. + */ + struct task_struct *cp_task; + /** + * Owning IO request in cl_page_state::CPS_PAGEOUT and + * cl_page_state::CPS_PAGEIN states. This field is maintained only in + * the top-level pages. Protected by a VM lock. + */ + struct cl_req *cp_req; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link cp_queue_ref; + /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */ + unsigned cp_flags; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see ccc_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \warning: cl_lock_mode_match() assumes particular ordering here. + * \ingroup cl_lock + */ +enum cl_lock_mode { + /** + * Mode of a lock that protects no data, and exists only as a + * placeholder. This is used for `glimpse' requests. A phantom lock + * might get promoted to real lock at some point. + */ + CLM_PHANTOM, + CLM_READ, + CLM_WRITE, + CLM_GROUP +}; + +/** + * Requested transfer type. + * \ingroup cl_req + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. 
Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * \return the underlying VM page. Optional. + */ + struct page *(*cpo_vmpage)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. + * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Unmaps page from the user space (if it is mapped). + * + * \see cl_page_unmap() + * \see vvp_page_unmap() + */ + int (*cpo_unmap)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice); + + /** + * Checks whether the page is protected by a cl_lock. 
This is a + * per-layer method, because certain layers have ways to check for the + * lock much more efficiently than through the generic locks scan, or + * implement locking mechanisms separate from cl_lock, e.g., + * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks + * being canceled, or scheduled for cancellation as soon as the last + * user goes away, too. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. + * + * \see cl_page_is_under_lock() + */ + int (*cpo_is_under_lock)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. See comment on cl_req for a description of + * transfer formation and life-cycle. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * cl_req as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Announce that this page is to be written out + * opportunistically, that is, page is dirty, it is not + * necessary to start write-out transfer right now, but + * eventually page has to be written out. + * + * Main caller of this is the write path (see + * vvp_io_commit_write()), using this method to build a + * "transfer cache" from which large transfers are then + * constructed by the req-formation engine. + * + * \todo XXX it would make sense to add page-age tracking + * semantics here, and to oblige the req-formation engine to + * send the page out not later than it is too old. 
+ * + * \see cl_page_cache_add() + */ + int (*cpo_cache_add)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline int __page_in_use(const struct cl_page *page, int refc) +{ + if (page->cp_type == CPT_CACHEABLE) + ++refc; + LASSERT(atomic_read(&page->cp_ref) > 0); + return (atomic_read(&page->cp_ref) > refc); +} +#define cl_page_in_use(pg) __page_in_use(pg, 1) +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * All locks for a given object are linked into cl_object_header::coh_locks + * list (protected by cl_object_header::coh_lock_guard spin-lock) through + * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can + * sort it in starting lock offset, or use altogether different data structure + * like a tree. 
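The per-object lock list mentioned above is walked linearly when looking for an existing lock that already covers a requested extent; sorting by start offset, or keeping a tree, would only change the cost of that walk. The sketch below is a standalone illustration under assumed toy types (toy_lock, toy_lock_match) of matching a page extent [cld_start, cld_end] with a compatible mode; it is not the actual cl_lock_lookup() or lock-matching code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef unsigned long pgoff_t;

enum toy_lock_mode { TOY_READ, TOY_WRITE };

struct toy_lock {
	pgoff_t start;               /* like cl_lock_descr::cld_start */
	pgoff_t end;                 /* inclusive, like cld_end */
	enum toy_lock_mode mode;     /* like cld_mode */
	struct toy_lock *next;       /* stands in for the coh_locks linkage */
};

static bool mode_covers(enum toy_lock_mode have, enum toy_lock_mode need)
{
	/* A write lock satisfies a read request, not the other way around. */
	return have == TOY_WRITE || need == TOY_READ;
}

static struct toy_lock *toy_lock_match(struct toy_lock *head,
				       pgoff_t start, pgoff_t end,
				       enum toy_lock_mode need)
{
	struct toy_lock *lk;

	/* Linear scan of the unsorted list: the first lock whose extent
	 * fully covers [start, end] with a compatible mode wins.  Sorting
	 * by start offset (or using a tree) would change the cost of this
	 * loop, not the matching rule. */
	for (lk = head; lk != NULL; lk = lk->next)
		if (lk->start <= start && end <= lk->end &&
		    mode_covers(lk->mode, need))
			return lk;
	return NULL;
}

int main(void)
{
	struct toy_lock l1 = { .start = 0,  .end = 15, .mode = TOY_READ };
	struct toy_lock l0 = { .start = 10, .end = 40, .mode = TOY_WRITE,
			       .next = &l1 };

	/* [12, 20] for read is covered by the write lock on [10, 40]. */
	printf("match: %s\n",
	       toy_lock_match(&l0, 12, 20, TOY_READ) ? "found" : "none");
	return 0;
}

Treating a write lock as covering read requests is only for illustration; the real matching additionally consults per-layer ->clo_fits_into() methods and the enqueue flags, as described further below.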
+ * + * Typical cl_lock consists of the two layers: + * + * - vvp_lock (vvp specific data), and + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - lovsub_lock, and + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is reference counted. When reference counter drops to 0, lock is + * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING + * lock is destroyed when last reference is released. Referencing between + * top-lock and its sub-locks is described in the lov documentation module. + * + * STATE MACHINE + * + * Also, cl_lock is a state machine. This requires some clarification. One of + * the goals of client IO re-write was to make IO path non-blocking, or at + * least to make it easier to make it non-blocking in the future. Here + * `non-blocking' means that when a system call (read, write, truncate) + * reaches a situation where it has to wait for a communication with the + * server, it should --instead of waiting-- remember its current state and + * switch to some other work. E.g,. instead of waiting for a lock enqueue, + * client should proceed doing IO on the next stripe, etc. Obviously this is + * rather radical redesign, and it is not planned to be fully implemented at + * this time, instead we are putting some infrastructure in place, that would + * make it easier to do asynchronous non-blocking IO easier in the + * future. Specifically, where old locking code goes to sleep (waiting for + * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When + * enqueue reply comes, its completion handler signals that lock state-machine + * is ready to transit to the next state. There is some generic code in + * cl_lock.c that sleeps, waiting for these signals. As a result, for users of + * this cl_lock.c code, it looks like locking is done in normal blocking + * fashion, and it the same time it is possible to switch to the non-blocking + * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c + * functions). + * + * For a description of state machine states and transitions see enum + * cl_lock_state. + * + * There are two ways to restrict a set of states which lock might move to: + * + * - placing a "hold" on a lock guarantees that lock will not be moved + * into cl_lock_state::CLS_FREEING state until hold is released. Hold + * can be only acquired on a lock that is not in + * cl_lock_state::CLS_FREEING. All holds on a lock are counted in + * cl_lock::cll_holds. Hold protects lock from cancellation and + * destruction. Requests to cancel and destroy a lock on hold will be + * recorded, but only honored when last hold on a lock is released; + * + * - placing a "user" on a lock guarantees that lock will not leave + * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of + * states, once it enters this set. 
That is, if a user is added onto a + * lock in a state not from this set, it doesn't immediately enforce + * lock to move to this set, but once lock enters this set it will + * remain there until all users are removed. Lock users are counted in + * cl_lock::cll_users. + * + * User is used to assure that lock is not canceled or destroyed while + * it is being enqueued, or actively used by some IO. + * + * Currently, a user always comes with a hold (cl_lock_invariant() + * checks that a number of holds is not less than a number of users). + * + * CONCURRENCY + * + * This is how lock state-machine operates. struct cl_lock contains a mutex + * cl_lock::cll_guard that protects struct fields. + * + * - mutex is taken, and cl_lock::cll_state is examined. + * + * - for every state there are possible target states where lock can move + * into. They are tried in order. Attempts to move into next state are + * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try(). + * + * - if the transition can be performed immediately, state is changed, + * and mutex is released. + * + * - if the transition requires blocking, _try() function returns + * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to + * sleep, waiting for possibility of lock state change. It is woken + * up when some event occurs, that makes lock state change possible + * (e.g., the reception of the reply from the server), and repeats + * the loop. + * + * Top-lock and sub-lock has separate mutexes and the latter has to be taken + * first to avoid dead-lock. + * + * To see an example of interaction of all these issues, take a look at the + * lov_cl.c:lov_lock_enqueue() function. It is called as a part of + * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by + * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note + * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It + * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be + * done in parallel, rather than one after another (this is used for glimpse + * locks, that cannot dead-lock). + * + * INTERFACE AND USAGE + * + * struct cl_lock_operations provide a number of call-backs that are invoked + * when events of interest occurs. Layers can intercept and handle glimpse, + * blocking, cancel ASTs and a reception of the reply from the server. + * + * One important difference with the old client locking model is that new + * client has a representation for the top-lock, whereas in the old code only + * sub-locks existed as real data structures and file-level locks are + * represented by "request sets" that are created and destroyed on each and + * every lock creation. + * + * Top-locks are cached, and can be found in the cache by the system calls. It + * is possible that top-lock is in cache, but some of its sub-locks were + * canceled and destroyed. In that case top-lock has to be enqueued again + * before it can be used. + * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released into cache. + * + * Striping introduces major additional complexity into locking. 
The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * As multi-stripe locks have to be allowed, it makes sense to cache them, so + * that, for example, a sequence of O_APPEND writes can proceed quickly + * without going down to the individual stripes to do lock matching. On the + * other hand, multi-stripe locks shouldn't be used by normal read/write + * calls. To achieve this, every layer can implement ->clo_fits_into() method, + * that is called by lock matching code (cl_lock_lookup()), and that can be + * used to selectively disable matching of certain locks for certain IOs. For + * example, lov layer implements lov_lock_fits_into() that allow multi-stripe + * locks to be matched only for truncates and O_APPEND writes. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Lock state-machine states. + * + * \htmlonly + *
+ * <pre>
+ * Possible state transitions:
+ *
+ *	      +------------------>NEW
+ *	      |		    |
+ *	      |		    | cl_enqueue_try()
+ *	      |		    |
+ *	      |    cl_unuse_try()  V
+ *	      |  +--------------QUEUING (*)
+ *	      |  |		 |
+ *	      |  |		 | cl_enqueue_try()
+ *	      |  |		 |
+ *	      |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |		 |
+ *	      |  |		 | cl_wait_try()
+ *	      |  |		 |
+ *	      |  |		(R)
+ *	      |  |		 |
+ *	      |  |		 V
+ *	      |  |		HELD<---------+
+ *	      |  |		 |	    |
+ *	      |  |		 |	    | cl_use_try()
+ *	      |  |  cl_unuse_try() |	    |
+ *	      |  |		 |	    |
+ *	      |  |		 V	 ---+
+ *	      |  +------------>INTRANSIT (D) <--+
+ *	      |		    |	    |
+ *	      |     cl_unuse_try() |	    | cached lock found
+ *	      |		    |	    | cl_use_try()
+ *	      |		    |	    |
+ *	      |		    V	    |
+ *	      +------------------CACHED---------+
+ *				   |
+ *				  (C)
+ *				   |
+ *				   V
+ *				FREEING
+ *
+ * Legend:
+ *
+ *	 In states marked with (*) transition to the same state (i.e., a loop
+ *	 in the diagram) is possible.
+ *
+ *	 (R) is the point where Receive call-back is invoked: it allows layers
+ *	 to handle arrival of lock reply.
+ *
+ *	 (C) is the point where Cancellation call-back is invoked.
+ *
+ *	 (D) is the transit state which means the lock is changing.
+ *
+ *	 Transition to FREEING state is possible from any other state in the
+ *	 diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly + * + * These states are for individual cl_lock object. Top-lock and its sub-locks + * can be in the different states. Another way to say this is that we have + * nested state-machines. + * + * Separate QUEUING and ENQUEUED states are needed to support non-blocking + * operation for locks with multiple sub-locks. Imagine lock on a file F, that + * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send + * enqueue to S0, wait for its completion, then send enqueue for S1, wait for + * its completion and at last enqueue lock for S2, and wait for its + * completion. In that case, top-lock is in QUEUING state while S0, S1 are + * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note + * that in this case, sub-locks move from state to state, and top-lock remains + * in the same state). + */ +enum cl_lock_state { + /** + * Lock that wasn't yet enqueued + */ + CLS_NEW, + /** + * Enqueue is in progress, blocking for some intermediate interaction + * with the other side. + */ + CLS_QUEUING, + /** + * Lock is fully enqueued, waiting for server to reply when it is + * granted. + */ + CLS_ENQUEUED, + /** + * Lock granted, actively used by some IO. + */ + CLS_HELD, + /** + * This state is used to mark the lock is being used, or unused. + * We need this state because the lock may have several sublocks, + * so it's impossible to have an atomic way to bring all sublocks + * into CLS_HELD state at use case, or all sublocks to CLS_CACHED + * at unuse case. + * If a thread is referring to a lock, and it sees the lock is in this + * state, it must wait for the lock. + * See state diagram for details. + */ + CLS_INTRANSIT, + /** + * Lock granted, not used. + */ + CLS_CACHED, + /** + * Lock is being destroyed. + */ + CLS_FREEING, + CLS_NR +}; + +enum cl_lock_flags { + /** + * lock has been cancelled. This flag is never cleared once set (by + * cl_lock_cancel0()). + */ + CLF_CANCELLED = 1 << 0, + /** cancellation is pending for this lock. */ + CLF_CANCELPEND = 1 << 1, + /** destruction is pending for this lock. */ + CLF_DOOMED = 1 << 2, + /** from enqueue RPC reply upcall. */ + CLF_FROM_UPCALL= 1 << 3, +}; + +/** + * Lock closure. + * + * Lock closure is a collection of locks (both top-locks and sub-locks) that + * might be updated in a result of an operation on a certain lock (which lock + * this is a closure of). + * + * Closures are needed to guarantee dead-lock freedom in the presence of + * + * - nested state-machines (top-lock state-machine composed of sub-lock + * state-machines), and + * + * - shared sub-locks. + * + * Specifically, many operations, such as lock enqueue, wait, unlock, + * etc. start from a top-lock, and then operate on a sub-locks of this + * top-lock, holding a top-lock mutex. When sub-lock state changes as a result + * of such operation, this change has to be propagated to all top-locks that + * share this sub-lock. Obviously, no natural lock ordering (e.g., + * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has + * to be used. Lock closure systematizes this try-and-repeat logic. + */ +struct cl_lock_closure { + /** + * Lock that is mutexed when closure construction is started. When + * closure in is `wait' mode (cl_lock_closure::clc_wait), mutex on + * origin is released before waiting. + */ + struct cl_lock *clc_origin; + /** + * List of enclosed locks, so far. Locks are linked here through + * cl_lock::cll_inclosure. + */ + struct list_head clc_list; + /** + * True iff closure is in a `wait' mode. 
This determines what + * cl_lock_enclosure() does when a lock L to be added to the closure + * is currently mutexed by some other thread. + * + * If cl_lock_closure::clc_wait is not set, then closure construction + * fails with CLO_REPEAT immediately. + * + * In wait mode, cl_lock_enclosure() waits until next attempt to build + * a closure might succeed. To this end it releases an origin mutex + * (cl_lock_closure::clc_origin), that has to be the only lock mutex + * owned by the current thread, and then waits on L mutex (by grabbing + * it and immediately releasing), before returning CLO_REPEAT to the + * caller. + */ + int clc_wait; + /** Number of locks in the closure. */ + int clc_nr; +}; + +/** + * Layered client lock. + */ +struct cl_lock { + /** Reference counter. */ + atomic_t cll_ref; + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** + * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected + * by cl_lock::cll_descr::cld_obj::coh_lock_guard. + */ + struct list_head cll_linkage; + /** + * Parameters of this lock. Protected by + * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within + * cl_lock::cll_guard. Modified only on lock creation and in + * cl_lock_modify(). + */ + struct cl_lock_descr cll_descr; + /** Protected by cl_lock::cll_guard. */ + enum cl_lock_state cll_state; + /** signals state changes. */ + wait_queue_head_t cll_wq; + /** + * Recursive lock, most fields in cl_lock{} are protected by this. + * + * Locking rules: this mutex is never held across network + * communication, except when lock is being canceled. + * + * Lock ordering: a mutex of a sub-lock is taken first, then a mutex + * on a top-lock. Other direction is implemented through a + * try-lock-repeat loop. Mutices of unrelated locks can be taken only + * by try-locking. + * + * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait(). + */ + struct mutex cll_guard; + struct task_struct *cll_guarder; + int cll_depth; + + /** + * the owner for INTRANSIT state + */ + struct task_struct *cll_intransit_owner; + int cll_error; + /** + * Number of holds on a lock. A hold prevents a lock from being + * canceled and destroyed. Protected by cl_lock::cll_guard. + * + * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release() + */ + int cll_holds; + /** + * Number of lock users. Valid in cl_lock_state::CLS_HELD state + * only. Lock user pins lock in CLS_HELD state. Protected by + * cl_lock::cll_guard. + * + * \see cl_wait(), cl_unuse(). + */ + int cll_users; + /** + * Flag bit-mask. Values from enum cl_lock_flags. Updates are + * protected by cl_lock::cll_guard. + */ + unsigned long cll_flags; + /** + * A linkage into a list of locks in a closure. + * + * \see cl_lock_closure + */ + struct list_head cll_inclosure; + /** + * Confict lock at queuing time. + */ + struct cl_lock *cll_conflict; + /** + * A list of references to this lock, for debugging. + */ + struct lu_ref cll_reference; + /** + * A list of holds on this lock, for debugging. + */ + struct lu_ref cll_holders; + /** + * A reference for cl_lock::cll_descr::cld_obj. For debugging. + */ + struct lu_ref_link cll_obj_ref; +#ifdef CONFIG_LOCKDEP + /* "dep_map" name is assumed by lockdep.h macros. */ + struct lockdep_map dep_map; +#endif +}; + +/** + * Per-layer part of cl_lock + * + * \see ccc_lock, lov_lock, lovsub_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. 
*/ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}(). + * + * NOTE: lov_subresult() depends on ordering here. + */ +enum cl_lock_transition { + /** operation cannot be completed immediately. Wait for state change. */ + CLO_WAIT = 1, + /** operation had to release lock mutex, restart. */ + CLO_REPEAT = 2, + /** lower layer re-enqueued. */ + CLO_REENQUEUED = 3, +}; + +/** + * + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** + * \name statemachine + * + * State machine transitions. These 3 methods are called to transfer + * lock from one state to another, as described in the commentary + * above enum #cl_lock_state. + * + * \retval 0 this layer has nothing more to do to before + * transition to the target state happens; + * + * \retval CLO_REPEAT method had to release and re-acquire cl_lock + * mutex, repeat invocation of transition method + * across all layers; + * + * \retval CLO_WAIT this layer cannot move to the target state + * immediately, as it has to wait for certain event + * (e.g., the communication with the server). It + * is guaranteed, that when the state transfer + * becomes possible, cl_lock::cll_wq wait-queue + * is signaled. Caller can wait for this event by + * calling cl_lock_state_wait(); + * + * \retval -ve failure, abort state transition, move the lock + * into cl_lock_state::CLS_FREEING state, and set + * cl_lock::cll_error. + * + * Once all layers voted to agree to transition (by returning 0), lock + * is moved into corresponding target state. All state transition + * methods are optional. + */ + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); + /** + * Attempts to wait for enqueue result. Called top-to-bottom. + * + * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait() + */ + int (*clo_wait)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Attempts to unlock the lock. Called bottom-to-top. In addition to + * usual return values of lock state-machine methods, this can return + * -ESTALE to indicate that lock cannot be returned to the cache, and + * has to be re-initialized. + * unuse is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse() + */ + int (*clo_unuse)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Notifies layer that cached lock is started being used. + * + * \pre lock->cll_state == CLS_CACHED + * + * \see lov_lock_use(), osc_lock_use() + */ + int (*clo_use)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} statemachine */ + /** + * A method invoked when lock state is changed (as a result of state + * transition). This is used, for example, to track when the state of + * a sub-lock changes, to propagate this change to the corresponding + * top-lock. 
Optional + * + * \see lovsub_lock_state() + */ + void (*clo_state)(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state st); + /** + * Returns true, iff given lock is suitable for the given io, idea + * being, that there are certain "unsafe" locks, e.g., ones acquired + * for O_APPEND writes, that we don't want to re-use for a normal + * write, to avoid the danger of cascading evictions. Optional. Runs + * under cl_object_header::coh_lock_guard. + * + * XXX this should take more information about lock needed by + * io. Probably lock description or something similar. + * + * \see lov_fits_into() + */ + int (*clo_fits_into)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); + /** + * \name ast + * Asynchronous System Traps. All of then are optional, all are + * executed bottom-to-top. + */ + /** @{ */ + + /** + * Cancellation callback. Cancel a lock voluntarily, or under + * the request of server. + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Lock weighting ast. Executed to estimate how precious this lock + * is. The sum of results across all layers is used to determine + * whether lock worth keeping in cache given present memory usage. + * + * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh(). + */ + unsigned long (*clo_weigh)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} ast */ + + /** + * \see lovsub_lock_closure() + */ + int (*clo_closure)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure); + /** + * Executed bottom-to-top when lock description changes (e.g., as a + * result of server granting more generous lock than was requested). + * + * \see lovsub_lock_modify() + */ + int (*clo_modify)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *updated); + /** + * Notifies layers (bottom-to-top) that lock is going to be + * destroyed. Responsibility of layers is to prevent new references on + * this lock from being acquired once this method returns. + * + * This can be called multiple times due to the races. + * + * \see cl_lock_delete() + * \see osc_lock_delete(), lovsub_lock_delete() + */ + void (*clo_delete)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Destructor. Frees resources and the slice. + * + * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. 
Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; + struct task_struct *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. + * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * (cl_io_operations::cio_read_page() for read, + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - glimpse. 
An io context to acquire glimpse lock. + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io, ccc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. 
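+		 *
+		 * For orientation only: a rough sketch of the order in which
+		 * the generic code drives these per-iteration methods (the
+		 * authoritative sequence lives in cl_io_loop(); the top-level
+		 * wrappers used below are declared further down in this
+		 * header):
+		 *
+		 *	cl_io_iter_init(env, io);
+		 *	cl_io_lock(env, io);
+		 *	cl_io_start(env, io);
+		 *	cl_io_end(env, io);
+		 *	cl_io_unlock(env, io);
+		 *	cl_io_iter_fini(env, io);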
+ */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + struct { + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + } req_op[CRT_NR]; + /** + * Read missing page. + * + * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start() + * method, when it hits not-up-to-date page in the range. Optional. + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_page)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page); + /** + * Prepare write of a \a page. Called bottom-to-top by a top-level + * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for + * get data from user-level buffer. + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_prepare_write(), lov_io_prepare_write(), + * osc_io_prepare_write(). + */ + int (*cio_prepare_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_commit_write(), lov_io_commit_write(), + * osc_io_commit_write(). + */ + int (*cio_commit_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. 
+ * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * for async glimpse lock. + */ + CEF_AGL = 0x00000020, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x0000003f, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock_descr cill_descr; + struct cl_lock *cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks currently being processed. */ + struct list_head cls_curr; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. 
It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_valid; + struct obd_capa *sa_capa; + } ci_setattr; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + int ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + struct obd_capa *fi_capa; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to openning files, it doesn't need to + * verify layout because HSM won't release openning files. + * Right now, only two operations need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1, + /** + * file is released, restore has to to be triggered by vvp layer + */ + ci_restore_needed:1, + /** + * O_NOATIME + */ + ci_noatime:1; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */ + +/** \addtogroup cl_req cl_req + * @{ */ +/** \struct cl_req + * Transfer. + * + * There are two possible modes of transfer initiation on the client: + * + * - immediate transfer: this is started when a high level io wants a page + * or a collection of pages to be transferred right away. Examples: + * read-ahead, synchronous read in the case of non-page aligned write, + * page write-out as a part of extent lock cancellation, page write-out + * as a part of memory cleansing. 
Immediate transfer can be both + * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; + * + * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens + * when io wants to transfer a page to the server some time later, when + * it can be done efficiently. Example: pages dirtied by the write(2) + * path. + * + * In any case, transfer takes place in the form of a cl_req, which is a + * representation for a network RPC. + * + * Pages queued for an opportunistic transfer are cached until it is decided + * that efficient RPC can be composed of them. This decision is made by "a + * req-formation engine", currently implemented as a part of osc + * layer. Req-formation depends on many factors: the size of the resulting + * RPC, whether or not multi-object RPCs are supported by the server, + * max-rpc-in-flight limitations, size of the dirty cache, etc. + * + * For the immediate transfer io submits a cl_page_list, that req-formation + * engine slices into cl_req's, possibly adding cached pages to some of + * the resulting req's. + * + * Whenever a page from cl_page_list is added to a newly constructed req, its + * cl_page_operations::cpo_prep() layer methods are called. At that moment, + * page state is atomically changed from cl_page_state::CPS_OWNED to + * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner + * is zeroed, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_prep() method at the particular layer might + * return -EALREADY to indicate that it does not need to submit this page + * at all. This is possible, for example, if page, submitted for read, + * became up-to-date in the meantime; and for write, the page don't have + * dirty bit marked. \see cl_io_submit_rw() + * + * Whenever a cached page is added to a newly constructed req, its + * cl_page_operations::cpo_make_ready() layer methods are called. At that + * moment, page state is atomically changed from cl_page_state::CPS_CACHED to + * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to + * req. cl_page_operations::cpo_make_ready() method at the particular layer + * might return -EAGAIN to indicate that this page is not eligible for the + * transfer right now. + * + * FUTURE + * + * Plan is to divide transfers into "priority bands" (indicated when + * submitting cl_page_list, and queuing a page for the opportunistic transfer) + * and allow glueing of cached pages to immediate transfers only within single + * band. This would make high priority transfers (like lock cancellation or + * memory pressure induced write-out) really high priority. + * + */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Capability. */ + struct obd_capa *cra_capa; + /** Jobid */ + char cra_jobid[JOBSTATS_JOBID_SIZE]; +}; + +/** + * Transfer request operations definable at every layer. + * + * Concurrency: transfer formation engine synchronizes calls to all transfer + * methods. + */ +struct cl_req_operations { + /** + * Invoked top-to-bottom by cl_req_prep() when transfer formation is + * complete (all pages are added). + * + * \see osc_req_prep() + */ + int (*cro_prep)(const struct lu_env *env, + const struct cl_req_slice *slice); + /** + * Called top-to-bottom to fill in \a oa fields. This is called twice + * with different flags, see bug 10150 and osc_build_req(). + * + * \param obj an object from cl_req which attributes are to be set in + * \a oa. 
+ * + * \param oa struct obdo where attributes are placed + * + * \param flags \a oa fields to be filled. + */ + void (*cro_attr_set)(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags); + /** + * Called top-to-bottom from cl_req_completion() to notify layers that + * transfer completed. Has to free all state allocated by + * cl_device_operations::cdo_req_init(). + */ + void (*cro_completion)(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +}; + +/** + * A per-object state that (potentially multi-object) transfer request keeps. + */ +struct cl_req_obj { + /** object itself */ + struct cl_object *ro_obj; + /** reference to cl_req_obj::ro_obj. For debugging. */ + struct lu_ref_link ro_obj_ref; + /* something else? Number of pages for a given object? */ +}; + +/** + * Transfer request. + * + * Transfer requests are not reference counted, because IO sub-system owns + * them exclusively and knows when to free them. + * + * Life cycle. + * + * cl_req is created by cl_req_alloc() that calls + * cl_device_operations::cdo_req_init() device methods to allocate per-req + * state in every layer. + * + * Then pages are added (cl_req_page_add()), req keeps track of all objects it + * contains pages for. + * + * Once all pages were collected, cl_page_operations::cpo_prep() method is + * called top-to-bottom. At that point layers can modify req, let it pass, or + * deny it completely. This is to support things like SNS that have transfer + * ordering requirements invisible to the individual req-formation engine. + * + * On transfer completion (or transfer timeout, or failure to initiate the + * transfer of an allocated req), cl_req_operations::cro_completion() method + * is called, after execution of cl_page_operations::cpo_completion() of all + * req's pages. + */ +struct cl_req { + enum cl_req_type crq_type; + /** A list of pages being transferred */ + struct list_head crq_pages; + /** Number of pages in cl_req::crq_pages */ + unsigned crq_nrpages; + /** An array of objects which pages are in ->crq_pages */ + struct cl_req_obj *crq_o; + /** Number of elements in cl_req::crq_objs[] */ + unsigned crq_nrobjs; + struct list_head crq_layers; +}; + +/** + * Per-layer state for request. + */ +struct cl_req_slice { + struct cl_req *crs_req; + struct cl_device *crs_dev; + struct list_head crs_linkage; + const struct cl_req_operations *crs_ops; +}; + +/* @} cl_req */ + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. 
Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + struct cache_stats cs_locks; + atomic_t cs_pages_state[CPS_NR]; + atomic_t cs_locks_state[CLS_NR]; +}; + +int cl_site_init (struct cl_site *s, struct cl_device *top); +void cl_site_fini (struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline int lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_device *cl_object_device(const struct cl_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); + return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +void cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_has_locks (struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. 
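+ *
+ * A typical (purely illustrative) use is a sanity check that a page and
+ * the object an io operates on belong to the same file:
+ *
+ *	LASSERT(cl_object_same(page->cp_obj, io->ci_obj));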
+ */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; + +/* callback of cl_page_gang_lookup() */ +typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *, + struct cl_page *, void *); +int cl_page_gang_lookup (const struct lu_env *env, + struct cl_object *obj, + struct cl_io *io, + pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata); +struct cl_page *cl_page_lookup (struct cl_object_header *hdr, + pgoff_t index); +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_find_sub (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct page *cl_page_vmpage (const struct lu_env *env, + struct cl_page *page); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. 
+ */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. + */ +/** @{ */ +void cl_page_discard (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete (const struct lu_env *env, struct cl_page *pg); +int cl_page_unmap (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +int cl_page_is_vmlocked (const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export (const struct lu_env *env, + struct cl_page *pg, int uptodate); +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +loff_t cl_offset (const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index (const struct cl_object *obj, loff_t offset); +int cl_page_size (const struct cl_object *obj); +int cl_pages_prune (const struct lu_env *env, struct cl_object *obj); + +void cl_lock_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ + +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, int pending, + int canceld); +static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, + struct cl_lock *except, + int pending, int canceld) +{ + LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj)); + return cl_lock_at_pgoff(env, obj, page->cp_index, except, + pending, canceld); +} + +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); + +void cl_lock_get (struct cl_lock *lock); +void cl_lock_get_trust (struct cl_lock *lock); +void cl_lock_put (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_release (const struct lu_env *env, struct cl_lock *lock, + const char *scope, 
const void *source); +void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock); + +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock); +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_lock_is_intransit(struct cl_lock *lock); + +int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock, + int keep_mutex); + +/** \name statemachine statemachine + * Interface to lock state machine consists of 3 parts: + * + * - "try" functions that attempt to effect a state transition. If state + * transition is not possible right now (e.g., if it has to wait for some + * asynchronous event to occur), these functions return + * cl_lock_transition::CLO_WAIT. + * + * - "non-try" functions that implement synchronous blocking interface on + * top of non-blocking "try" functions. These functions repeatedly call + * corresponding "try" versions, and if state transition is not possible + * immediately, wait for lock state change. + * + * - methods from cl_lock_operations, called by "try" functions. Lock can + * be advanced to the target state only when all layers voted that they + * are ready for this transition. "Try" functions call methods under lock + * mutex. If a layer had to release a mutex, it re-acquires it and returns + * cl_lock_transition::CLO_REPEAT, causing "try" function to call all + * layers again. + * + * TRY NON-TRY METHOD FINAL STATE + * + * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED + * + * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD + * + * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED + * + * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD + * + * @{ */ + +int cl_enqueue (const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_unuse (const struct lu_env *env, struct cl_lock *lock); +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock); +int cl_wait_try (const struct lu_env *env, struct cl_lock *lock); +int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic); + +/** @} statemachine */ + +void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_queue_match (const struct list_head *queue, + const struct cl_lock_descr *need); + +void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_is_mutexed (struct cl_lock *lock); +int cl_lock_nr_mutexed (const struct lu_env *env); +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock); +int cl_lock_ext_match (const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need); +int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc); + +void 
cl_lock_closure_init (const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait); +void cl_lock_closure_fini (struct cl_lock_closure *closure); +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); +void cl_lock_disclosure (const struct lu_env *env, + struct cl_lock_closure *closure); +int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error); +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait); + +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_read_page (const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_commit_write (const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); +int cl_io_is_going (const struct lu_env *env); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). 
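+ *
+ * Layers that must treat truncation specially can branch on this
+ * predicate; the target attributes (including the new size) travel in
+ * io->u.ci_setattr.sa_attr, with the valid bits in sa_valid. An
+ * illustrative check, using a hypothetical helper name:
+ *
+ *	if (cl_io_is_trunc(io))
+ *		layer_prepare_truncate(env, io);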
+ */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + sizeof(*__foo_io) - sizeof(__foo_io->base)); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_own (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_unmap (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +/** \defgroup cl_req cl_req + * @{ */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects); + +void cl_req_page_add (const struct lu_env *env, struct cl_req *req, + struct cl_page *page); +void cl_req_page_done (const struct lu_env *env, struct cl_page *page); +int cl_req_prep (const struct lu_env *env, struct cl_req *req); +void cl_req_attr_set (const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, u64 flags); +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. 
+ * This is allocated on the stack by a thread doing a synchronous transfer,
+ * and a pointer to this structure is set up in every page submitted for
+ * transfer. The transfer completion routine updates the anchor and wakes up
+ * the waiting thread when the transfer is complete.
+ */
+struct cl_sync_io {
+	/** number of pages yet to be transferred. */
+	atomic_t		csi_sync_nr;
+	/** error code. */
+	int			csi_sync_rc;
+	/** barrier for destroying this structure */
+	atomic_t		csi_barrier;
+	/** completion to be signaled when transfer is complete. */
+	wait_queue_head_t	csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct cl_sync_io *anchor,
+		    long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ * - allocation and destruction of environment is amortized by caching no
+ *   longer used environments instead of destroying them;
+ *
+ * - there is a notion of "current" environment, attached to the kernel
+ *   data structure representing the current thread. Top-level lustre code
+ *   allocates an environment and makes it current, then calls into
+ *   non-lustre code, that in turn calls lustre back. Low-level lustre
+ *   code thus called can fetch the environment created by the top-level code
+ *   and reuse it, avoiding additional environment allocation.
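+ *
+ * A minimal client-side usage sketch (error handling elided; this is
+ * illustrative, not a prescription):
+ *
+ *	int refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (!IS_ERR(env)) {
+ *		... call cl_* interfaces that take \a env ...
+ *		cl_env_put(env, &refcheck);
+ *	}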
+ * Right now, three interfaces can attach the cl_env to running thread: + * - cl_env_get + * - cl_env_implant + * - cl_env_reexit(cl_env_reenter had to be called priorly) + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct cl_env_nest { + int cen_refcheck; + void *cen_cookie; +}; + +struct lu_env *cl_env_peek (int *refcheck); +struct lu_env *cl_env_get (int *refcheck); +struct lu_env *cl_env_alloc (int *refcheck, __u32 tags); +struct lu_env *cl_env_nested_get (struct cl_env_nest *nest); +void cl_env_put (struct lu_env *env, int *refcheck); +void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env); +void *cl_env_reenter (void); +void cl_env_reexit (void *cookie); +void cl_env_implant (struct lu_env *env, int *refcheck); +void cl_env_unplant (struct lu_env *env, int *refcheck); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/dt_object.h b/kernel/drivers/staging/lustre/lustre/include/dt_object.h new file mode 100644 index 000000000..be4c7d95e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/dt_object.h @@ -0,0 +1,1499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + + +/* + * super-class definitions. 
+ */ +#include "lu_object.h" + +#include "../../include/linux/libcfs/libcfs.h" + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_block_shift; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */ + int ddp_mount_type; + unsigned long long ddp_maxbytes; + /* percentage of available space to reserve for grant error margin */ + int ddp_grant_reserved; + /* per-inode space consumption */ + short ddp_inodespace; + /* per-fragment grant overhead to be used by client for grant + * calculation */ + int ddp_grant_frag; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + __u32 dcb_magic; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, struct obd_statfs *osfs); + /** + * Create transaction, described by \a param. + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + /** + * Start transaction, described by \a param. + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, struct thandle *th); + /** + * Finish previously started transaction. + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct thandle *th); + /** + * Add commit callback to the transaction. + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + /** + * Return fid of root index object. + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f); + /** + * Return device configuration data. + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + /** + * handling device state, mostly for tests + */ + int (*dt_sync)(const struct lu_env *env, struct dt_device *dev); + int (*dt_ro)(const struct lu_env *env, struct dt_device *dev); + /** + * Start a transaction commit asynchronously + * + * \param env environment + * \param dev dt_device to start commit on + * + * \return 0 success, negative value if error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + /** + * Initialize capability context. 
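+ *
+ * (Editor's note on the transaction methods declared earlier in this
+ * structure, not on ->dt_init_capa_ctxt itself: the sketch below is
+ * illustrative only. It uses the dt_trans_create()/dt_trans_start()/
+ * dt_trans_stop() and dt_declare_record_write()/dt_record_write() helpers
+ * defined later in this header; "obj", "buf", "size" and "pos" are
+ * placeholder names, the usual ERR_PTR() convention is assumed for
+ * dt_trans_create(), and error handling is simplified. All changes are
+ * declared before the transaction is started.)
+ *
+ *	th = dt_trans_create(env, dev);
+ *	if (IS_ERR(th))
+ *		return PTR_ERR(th);
+ *	rc = dt_declare_record_write(env, obj, size, pos, th);
+ *	if (rc == 0)
+ *		rc = dt_trans_start(env, dev, th);
+ *	if (rc == 0)
+ *		rc = dt_record_write(env, obj, buf, &pos, th);
+ *	dt_trans_stop(env, dev, th);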
+ */ + int (*dt_init_capa_ctxt)(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. + */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +/** + * Per-dt-object operations. + */ +struct dt_object_operations { + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + /** + * Note: following ->do_{x,}attr_{set,get}() operations are very + * similar to ->moo_{x,}attr_{set,get}() operations in struct + * md_object_operations (see md_object.h). These operations are not in + * lu_object_operations, because ->do_{x,}attr_set() versions take + * transaction handle as an argument (this transaction is started by + * caller). 
We might factor ->do_{x,}attr_get() into + * lu_object_operations, but that would break existing symmetry. + */ + + /** + * Return standard attributes. + * + * precondition: lu_object_exists(&dt->do_lu); + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, struct lu_attr *attr, + struct lustre_capa *capa); + /** + * Set standard attributes. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle); + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle, + struct lustre_capa *capa); + /** + * Return a value of an extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name, + struct lustre_capa *capa); + /** + * Set value of an extended attribute. + * + * \a fl - flags from enum lu_xattr_flags + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *handle); + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *handle, + struct lustre_capa *capa); + /** + * Delete existing extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle); + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle, + struct lustre_capa *capa); + /** + * Place list of existing extended attributes into \a buf (which has + * length len). + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa); + /** + * Init allocation hint using parent object and child mode. + * (1) The \a parent might be NULL if this is a partial creation for + * remote object. + * (2) The type of child is in \a child_mode. + * (3) The result hint is stored in \a ah; + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t child_mode); + /** + * Create new object on this device. + * + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + int (*do_create)(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + Destroy object on this device + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + int (*do_destroy)(const struct lu_env *env, struct dt_object *dt, + struct thandle *th); + + /** + * Announce that this object is going to be used as an index. This + * operation check that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. 
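+ *
+ * (Editor's aside, illustrative only: a caller typically probes for index
+ * support before using the index methods, as sketched below; dt_lookup() is
+ * the wrapper defined later in this header, and "rec", "key" and "capa" are
+ * placeholder names.)
+ *
+ *	rc = dt->do_ops->do_index_try(env, dt, &dt_directory_features);
+ *	if (rc == 0)
+ *		rc = dt_lookup(env, dt, rec, key, capa);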
+ * + * Also probes for features. Operation is successful if all required + * features are supported. + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + /** + * Add nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + /** + * Del nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + struct obd_capa *(*do_capa_get)(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, + __u64 opc); + int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj, + __u64 start, __u64 end); + /** + * Get object info of next level. Currently, only get inode from osd. + * This is only used by quota b=16542 + * precondition: dt_object_exists(dt); + */ + int (*do_data_get)(const struct lu_env *env, struct dt_object *dt, + void **data); + + /** + * Lock object. + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Per-dt-object operations on "file body". + */ +struct dt_body_operations { + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const loff_t size, loff_t pos, + struct thandle *handle); + ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /* + * methods for zero-copy IO + */ + + /* + * precondition: dt_object_exists(dt); + * returns: + * < 0 - error code + * = 0 - illegal + * > 0 - number of local buffers prepared + */ + int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lb, + int rw, struct lustre_capa *capa); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *, + int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *, int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int nr); + int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt, + struct ll_user_fiemap *fm); + /** + * Punch object's content + * precondition: regular object, not index + */ + int (*dbo_declare_punch)(const struct lu_env *, struct dt_object *, + __u64, __u64, struct thandle *th); + int 
(*dbo_punch)(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. + */ +struct dt_index_operations { + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle); + int (*dio_insert)(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *handle); + int (*dio_delete)(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa); + /** + * Iterator interface + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * precondition: dt_object_exists(dt); + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr, + struct lustre_capa *capa); + void (*fini)(const struct lu_env *env, + struct dt_it *di); + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + void (*put)(const struct lu_env *env, + struct dt_it *di); + int (*next)(const struct lu_env *env, + struct dt_it *di); + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + int (*load)(const struct lu_env *env, + const struct dt_it *di, __u64 hash); + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, void *key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. */ + DOIF_OUTUSED = 0x0004, +}; + +/* otable based iteration needs to use the common DT interation APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. */ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. 
+ */ + struct list_head dd_txn_callbacks; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device *lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. + */ +struct thandle { + /** the dt device on which the transactions are executed */ + struct dt_device *th_dev; + + /** context for this transaction, tag is LCT_TX_HANDLE */ + struct lu_context th_ctx; + + /** additional tags (layers can add in declare) */ + __u32 th_tags; + + /** the last operation result in this transaction. + * this value is used in recovery */ + __s32 th_result; + + /** whether we need sync commit */ + unsigned int th_sync:1; + + /* local transation, no need to inform other layers */ + unsigned int th_local:1; + + /* In DNE, one transaction can be disassemblied into + * updates on several different MDTs, and these updates + * will be attached to th_remote_update_list per target. + * Only single thread will access the list, no need lock + */ + struct list_head th_remote_update_list; + struct update_request *th_current_request; +}; + +/** + * Transaction call-backs. + * + * These are invoked by osd (or underlying transaction engine) when + * transaction changes state. 
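+ *
+ * (Editor's sketch, illustrative only: registering a hook is a matter of
+ * filling in a struct dt_txn_callback -- defined below -- and passing it to
+ * dt_txn_callback_add(); the names used here are hypothetical.)
+ *
+ *	static int my_txn_stop(const struct lu_env *env,
+ *			       struct thandle *txn, void *cookie)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	static struct dt_txn_callback my_cb = {
+ *		.dtc_txn_stop = my_txn_stop,
+ *		.dtc_tag      = 0,
+ *	};
+ *
+ *	dt_txn_callback_add(dt_dev, &my_cb);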
+ * + * Call-backs are used by upper layers to modify transaction parameters and to + * perform some actions on for each transaction state transition. Typical + * example is mdt registering call-back to write into last-received file + * before each transaction commit. + */ +struct dt_txn_callback { + int (*dtc_txn_start)(const struct lu_env *env, + struct thandle *txn, void *cookie); + int (*dtc_txn_stop)(const struct lu_env *env, + struct thandle *txn, void *cookie); + void (*dtc_txn_commit)(struct thandle *txn, void *cookie); + void *dtc_cookie; + __u32 dtc_tag; + struct list_head dtc_linkage; +}; + +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb); +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *txn); +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); +void dt_txn_hook_commit(struct thandle *txn); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path. + * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev); +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev); +} + + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct 
dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, + struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_lock); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, int nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + int size, loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th); + return rc; +} + +static inline int dt_declare_create(const 
struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la, void *arg) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + return dt->do_ops->do_attr_get(env, dt, la, arg); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + return dt->do_ops->do_attr_set(env, dt, la, th, capa); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int 
dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline struct obd_capa *dt_capa_get(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, __u64 opc) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_capa_get(env, dt, old, opc); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int rw, + struct lustre_capa *capa) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset, + rnb->len, lnb, rw, capa); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct ll_user_fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + 
LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa, + int noquota) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, + capa, noquota); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + return dt->do_ops->do_xattr_del(env, dt, name, th, capa); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name, struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + return dt->do_ops->do_xattr_get(env, dt, buf, name, capa); +} + +static inline int dt_xattr_list(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + return dt->do_ops->do_xattr_list(env, dt, buf, capa); +} + +static inline int dt_declare_delete(const struct lu_env *env, + 
struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + return dt->do_index_ops->dio_delete(env, dt, key, th, capa); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_init_capa_ctxt(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_init_capa_ctxt); + return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, + timeout, alg, keys); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key, + struct lustre_capa *capa) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + loff_t dti_off; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); + +#if defined (CONFIG_PROC_FS) +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +#endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/interval_tree.h b/kernel/drivers/staging/lustre/lustre/include/interval_tree.h new file mode 100644 index 000000000..bf9027d5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/interval_tree.h @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include "../../include/linux/libcfs/libcfs.h" /* LASSERT. */ + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline void interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + LASSERT(start <= end); + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; +} + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. 
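+ *
+ * (Editor's sketch, illustrative only; the callback and variable names below
+ * are hypothetical. Each visited node is passed to the callback, which
+ * returns INTERVAL_ITER_CONT to continue or INTERVAL_ITER_STOP to stop.)
+ *
+ *	static enum interval_iter count_cb(struct interval_node *node,
+ *					   void *args)
+ *	{
+ *		(*(int *)args)++;
+ *		return INTERVAL_ITER_CONT;
+ *	}
+ *
+ *	int count = 0;
+ *	interval_iterate(root, count_cb, &count);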
*/ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lclient.h b/kernel/drivers/staging/lustre/lustre/include/lclient.h new file mode 100644 index 000000000..c5c3a8d9e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lclient.h @@ -0,0 +1,433 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Definitions shared between vvp and liblustre, and other clients in the + * future. + * + * Author: Oleg Drokin + * Author: Nikita Danilov + */ + +#ifndef LCLIENT_H +#define LCLIENT_H + +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +/** + * Locking policy for setattr. + */ +enum ccc_setattr_lock_type { + /** Locking is done by server */ + SETATTR_NOLOCK, + /** Extent lock is enqueued */ + SETATTR_EXTENT_LOCK, + /** Existing local extent lock is used */ + SETATTR_MATCH_LOCK +}; + + +/** + * IO state private to vvp or slp layers. + */ +struct ccc_io { + /** super class */ + struct cl_io_slice cui_cl; + struct cl_io_lock_link cui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *cui_iter; + /** + * Total size for the left IO. + */ + size_t cui_tot_count; + + union { + struct { + enum ccc_setattr_lock_type cui_local_lock; + } setattr; + } u; + /** + * True iff io is processing glimpse right now. 
+ */ + int cui_glimpse; + /** + * Layout version when this IO is initialized + */ + __u32 cui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *cui_fd; + struct kiocb *cui_iocb; +}; + +/** + * True, if \a io is a normal io, False for splice_{read,write}. + * must be implemented in arch specific code. + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io); + +extern struct lu_context_key ccc_key; +extern struct lu_context_key ccc_session_key; + +struct ccc_thread_info { + struct cl_lock_descr cti_descr; + struct cl_io cti_io; + struct cl_attr cti_attr; +}; + +static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env) +{ + struct ccc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &ccc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + + memset(attr, 0, sizeof(*attr)); + return attr; +} + +static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &ccc_env_info(env)->cti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +struct ccc_session { + struct ccc_io cs_ios; +}; + +static inline struct ccc_session *ccc_env_session(const struct lu_env *env) +{ + struct ccc_session *ses; + + ses = lu_context_key_get(env->le_ses, &ccc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct ccc_io *ccc_env_io(const struct lu_env *env) +{ + return &ccc_env_session(env)->cs_ios; +} + +/** + * ccc-private object state. + */ +struct ccc_object { + struct cl_object_header cob_header; + struct cl_object cob_cl; + struct inode *cob_inode; + + /** + * A list of dirty pages pending IO in the cache. Used by + * SOM. Protected by ll_inode_info::lli_lock. + * + * \see ccc_page::cpg_pending_linkage + */ + struct list_head cob_pending_list; + + /** + * Access this counter is protected by inode->i_sem. Now that + * the lifetime of transient pages must be covered by inode sem, + * we don't need to hold any lock.. + */ + int cob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t cob_mmap_cnt; + + /** + * various flags + * cob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int cob_discard_page_warned:1; +}; + +/** + * ccc-private page state. + */ +struct ccc_page { + struct cl_page_slice cpg_cl; + int cpg_defer_uptodate; + int cpg_ra_used; + int cpg_write_queued; + /** + * Non-empty iff this page is already counted in + * ccc_object::cob_pending_list. Protected by + * ccc_object::cob_pending_guard. This list is only used as a flag, + * that is, never iterated through, only checked for list_empty(), but + * having a list is useful for debugging. 
+ */ + struct list_head cpg_pending_linkage; + /** VM page */ + struct page *cpg_page; +}; + +static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct ccc_page, cpg_cl); +} + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); + +struct ccc_device { + struct cl_device cdv_cl; + struct super_block *cdv_sb; + struct cl_device *cdv_next; +}; + +struct ccc_lock { + struct cl_lock_slice clk_cl; +}; + +struct ccc_req { + struct cl_req_slice crq_cl; +}; + +void *ccc_key_init (const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_key_fini (const struct lu_context *ctx, + struct lu_context_key *key, void *data); +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + +int ccc_device_init (const struct lu_env *env, + struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *ccc_device_fini (const struct lu_env *env, + struct lu_device *d); +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops); +struct lu_device *ccc_device_free (const struct lu_env *env, + struct lu_device *d); +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops); + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +void ccc_umount(const struct lu_env *env, struct cl_device *dev); +int ccc_global_init(struct lu_device_type *device_type); +void ccc_global_fini(struct lu_device_type *device_type); +int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob, + const struct cl_object_conf *conf); +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void ccc_object_free(const struct lu_env *env, struct lu_object *obj); +int ccc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io, + const struct cl_lock_operations *lkops); +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice); +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice); +void ccc_transient_page_verify(const struct cl_page *page); +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice 
*slice, + struct cl_io *io); +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice); +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); +int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state); + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end); +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end); +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios); +void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios, + size_t nob); +void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio, + struct cl_io *io); +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed); +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *oa, u64 flags); + +struct lu_device *ccc2lu_dev (struct ccc_device *vdv); +struct lu_object *ccc2lu (struct ccc_object *vob); +struct ccc_device *lu2ccc_dev (const struct lu_device *d); +struct ccc_device *cl2ccc_dev (const struct cl_device *d); +struct ccc_object *lu2ccc (const struct lu_object *obj); +struct ccc_object *cl2ccc (const struct cl_object *obj); +struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice); +struct ccc_io *cl2ccc_io (const struct lu_env *env, + const struct cl_io_slice *slice); +struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice); +struct page *cl2vm_page (const struct cl_page_slice *slice); +struct inode *ccc_object_inode(const struct cl_object *obj); +struct ccc_object *cl_inode2ccc (struct inode *inode); + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa); + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); +int ccc_object_invariant(const struct cl_object *obj); +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); +int cl_local_size(struct inode *inode); + +__u16 ll_dirent_type_get(struct lu_dirent *ent); +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +__u32 cl_fid_build_gen(const struct lu_fid *fid); + +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr))) + +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp); +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + +struct ccc_grouplock { + 
struct lu_env *cg_env; + struct cl_io *cg_io; + struct cl_lock *cg_lock; + unsigned long cg_gid; +}; + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg); +void cl_put_grouplock(struct ccc_grouplock *cg); + +/** + * New interfaces to get and put lov_stripe_md from lov layer. This violates + * layering because lov_stripe_md is supposed to be a private data in lov. + * + * NB: If you find you have to use these interfaces for your new code, please + * think about it again. These interfaces may be removed in the future for + * better layering. */ +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj); +void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm); +int lov_read_and_clear_async_rc(struct cl_object *clob); + +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode); +void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm); + +/** + * Data structure managing a client's cached clean pages. An LRU of + * pages is maintained, along with other statistics. + */ +struct cl_client_cache { + atomic_t ccc_users; /* # of users (OSCs) of this data */ + struct list_head ccc_lru; /* LRU list of cached clean pages */ + spinlock_t ccc_lru_lock; /* lock for list */ + atomic_t ccc_lru_left; /* # of LRU entries available */ + unsigned long ccc_lru_max; /* Max # of LRU entries possible */ + unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */ +}; + +#endif /*LCLIENT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h new file mode 100644 index 000000000..3925db160 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -0,0 +1,216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
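The cl_get_grouplock()/cl_put_grouplock() pair declared above bundles everything a group lock needs (the lu_env, the cl_io, the cl_lock and the group id recorded in struct ccc_grouplock) behind two calls. A minimal usage sketch, assuming only the two prototypes above, is shown here; the helper name with_group_lock() and the choice of a blocking request are illustrative, not part of this header.

/* Sketch: take a Lustre client group lock around some I/O (illustrative). */
static int with_group_lock(struct cl_object *obj, unsigned long gid)
{
	struct ccc_grouplock cg;
	int rc;

	/* third argument is "nonblock": 0 waits, non-zero asks for a
	 * non-blocking attempt */
	rc = cl_get_grouplock(obj, gid, 0, &cg);
	if (rc != 0)
		return rc;

	/* ... I/O that must be covered by the group lock goes here ... */

	cl_put_grouplock(&cg);
	return 0;
}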
+ */ + +#ifndef _LINUX_COMPAT25_H +#define _LINUX_COMPAT25_H + +#include +#include + +#include "lustre_patchless_compat.h" + +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + + +#define LTIME_S(time) (time.tv_sec) + +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count) +/* inode_dio_done(i) use as-is for read unlock */ + + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + +#define ll_vfs_rmdir(dir, entry, mnt) vfs_rmdir(dir, entry) +#define ll_vfs_mkdir(inode, dir, mnt, mode) vfs_mkdir(inode, dir, mode) +#define ll_vfs_link(old, mnt, dir, new, mnt1) vfs_link(old, dir, new) +#define ll_vfs_unlink(inode, entry, mnt) vfs_unlink(inode, entry) +#define ll_vfs_mknod(dir, entry, mnt, mode, dev) \ + vfs_mknod(dir, entry, mode, dev) +#define ll_security_inode_unlink(dir, entry, mnt) \ + security_inode_unlink(dir, entry) +#define ll_vfs_rename(old, old_dir, mnt, new, new_dir, mnt1) \ + vfs_rename(old, old_dir, new, new_dir, NULL, 0) + +#define cfs_bio_io_error(a, b) bio_io_error((a)) +#define cfs_bio_endio(a, b, c) bio_endio((a), (c)) + +#define cfs_path_put(nd) path_put(&(nd)->path) + + +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 +#endif + + + +static inline int +ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount) +{ + int rc; + + if (sb->s_qcop->quota_on) { + struct path path; + + rc = kern_path(name, LOOKUP_FOLLOW, &path); + if (!rc) + return rc; + rc = sb->s_qcop->quota_on(sb, off, ver + , &path + ); + path_put(&path); + return rc; + } else + return -ENOSYS; +} + +static inline int ll_quota_off(struct super_block *sb, int off, int remount) +{ + if (sb->s_qcop->quota_off) { + return sb->s_qcop->quota_off(sb, off + ); + } else + return -ENOSYS; +} + + +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) + + + + + +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) + + +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, alias) + + +#define bio_hw_segments(q, bio) 0 + + +#define ll_pagevec_init(pv, cold) do {} while (0) +#define ll_pagevec_add(pv, pg) (0) +#define ll_pagevec_lru_add_file(pv) do {} while (0) + + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if 
!defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + + + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +#include + +# define ll_umode_t umode_t + +#include + +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) + +#endif /* _COMPAT25_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h new file mode 100644 index 000000000..a7658a99a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LL_H +#define _LINUX_LL_H + +#ifndef _LL_H +#error Do not #include this file directly. 
#include instead +#endif + + +#include + +#include +#include +#include + +#include "../obd_class.h" +#include "../lustre_net.h" +#include "../lustre_ha.h" + +#include +#include "../../include/linux/lustre_compat25.h" +#include + +/* lprocfs.c */ +enum { + LPROC_LL_DIRTY_HITS = 0, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_OSC_READ, + LPROC_LL_OSC_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h new file mode 100644 index 000000000..d72605864 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include + +#include +#include +#include + + +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} + +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h new file mode 100644 index 000000000..9cc2849f3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LINUX_LUSTRE_USER_H +#define _LINUX_LUSTRE_USER_H + +# include + +/* + * asm-x86_64/processor.h on some SLES 9 distros seems to use + * kernel-only typedefs. fortunately skipping it altogether is ok + * (for now). + */ +#define __ASM_X86_64_PROCESSOR_H + +#include + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. 
+ */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#endif + +#define HAVE_LOV_USER_MDS_DATA + +#endif /* _LUSTRE_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/obd.h b/kernel/drivers/staging/lustre/lustre/include/linux/obd.h new file mode 100644 index 000000000..9cd868357 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/obd.h @@ -0,0 +1,125 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_OBD_H +#define __LINUX_OBD_H + +#ifndef __OBD_H +#error Do not #include this file directly. 
#include instead +#endif + +#include "../obd_support.h" + +# include +# include +# include /* for struct task_struct, for current.h */ +# include +# include +#include "../lustre_intent.h" + +struct ll_iattr { + struct iattr iattr; + unsigned int ia_attr_flags; +}; + +#define CLIENT_OBD_LIST_LOCK_DEBUG 1 + +typedef struct { + spinlock_t lock; + + unsigned long time; + struct task_struct *task; + const char *func; + int line; +} client_obd_lock_t; + +static inline void __client_obd_list_lock(client_obd_lock_t *lock, + const char *func, int line) +{ + unsigned long cur = jiffies; + while (1) { + if (spin_trylock(&lock->lock)) { + LASSERT(lock->task == NULL); + lock->task = current; + lock->func = func; + lock->line = line; + lock->time = jiffies; + break; + } + + if (time_before(cur + 5 * HZ, jiffies) && + time_before(lock->time + 5 * HZ, jiffies)) { + struct task_struct *task = lock->task; + + if (task == NULL) + continue; + + LCONSOLE_WARN("%s:%d: lock %p was acquired by <%s:%d:%s:%d> for %lu seconds.\n", + current->comm, current->pid, + lock, task->comm, task->pid, + lock->func, lock->line, + (jiffies - lock->time) / HZ); + LCONSOLE_WARN("====== for current process =====\n"); + dump_stack(); + LCONSOLE_WARN("====== end =======\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1000 * HZ); + } + cpu_relax(); + } +} + +#define client_obd_list_lock(lock) \ + __client_obd_list_lock(lock, __func__, __LINE__) + +static inline void client_obd_list_unlock(client_obd_lock_t *lock) +{ + LASSERT(lock->task != NULL); + lock->task = NULL; + lock->time = jiffies; + spin_unlock(&lock->lock); +} + + +static inline void client_obd_list_lock_init(client_obd_lock_t *lock) +{ + spin_lock_init(&lock->lock); +} + +static inline void client_obd_list_lock_done(client_obd_lock_t *lock) +{} + +#endif /* __LINUX_OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h b/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h new file mode 100644 index 000000000..d030847e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h @@ -0,0 +1,1015 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
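The __client_obd_list_lock() helper above is a debugging spinlock: it spins on spin_trylock(), records the owner task, function, line and acquisition time on success, and once both the caller and the recorded owner appear stuck for roughly five seconds it reports the owner and dumps the current stack before sleeping. The standalone sketch below shows the same trylock-plus-watchdog pattern using only generic kernel primitives; the names watchdog_lock, watchdog_lock_acquire() and watchdog_lock_release() are hypothetical, and the real helper additionally logs through LCONSOLE_WARN() and records __func__/__LINE__.

/* Sketch: spinlock acquisition with a stall watchdog (illustrative only). */
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/printk.h>

struct watchdog_lock {
	spinlock_t		wl_lock;
	unsigned long		wl_time;	/* jiffies when acquired */
	struct task_struct	*wl_task;	/* current owner, NULL if free */
};

static inline void watchdog_lock_acquire(struct watchdog_lock *wl)
{
	unsigned long start = jiffies;

	for (;;) {
		if (spin_trylock(&wl->wl_lock)) {
			wl->wl_task = current;	/* record owner for diagnostics */
			wl->wl_time = jiffies;
			return;
		}
		/* Warn when both we and the owner look stuck for ~5 seconds. */
		if (time_after(jiffies, start + 5 * HZ) &&
		    time_after(jiffies, wl->wl_time + 5 * HZ) && wl->wl_task) {
			pr_warn("lock %p held by %s for %lus\n", wl,
				wl->wl_task->comm,
				(jiffies - wl->wl_time) / HZ);
			start = jiffies;	/* avoid repeating the warning */
		}
		cpu_relax();
	}
}

static inline void watchdog_lock_release(struct watchdog_lock *wl)
{
	wl->wl_task = NULL;
	wl->wl_time = jiffies;
	spin_unlock(&wl->wl_lock);
}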
+ * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc SNMP + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_SNMP_H +#define _LPROCFS_SNMP_H + +#include +#include +#include +#include + +#include "lustre/lustre_idl.h" + +struct lprocfs_vars { + const char *name; + struct file_operations *fops; + void *data; + /** + * /proc file mode. + */ + umode_t proc_mode; +}; + +struct lprocfs_static_vars { + struct lprocfs_vars *module_vars; + struct lprocfs_vars *obd_vars; +}; + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +enum { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_LAST, +}; + +struct brw_stats { + struct obd_histogram hist[BRW_LAST]; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + struct obd_histogram hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REGS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_CYCLE = 0x0800, +}; + +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. 
stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { +#ifndef __GNUC__ + __s64 pad; +#endif + struct lprocfs_counter lp_cntr[0]; +}; + +#define LPROCFS_GET_NUM_CPU 0x0001 +#define LPROCFS_GET_SMP_ID 0x0002 + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + 
OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(UPDATE)) + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry *proc_lustre_root; + +struct obd_device; +struct obd_histogram; + +/* Days / hours / mins / seconds format */ +struct dhms { + int d, h, m, s; +}; +static inline void s2dhms(struct dhms *ts, time_t secs) +{ + ts->d = secs / 86400; + secs = secs % 86400; + ts->h = secs / 3600; + secs = secs % 3600; + ts->m = secs / 60; + ts->s = secs % 60; +} +#define DHMS_FMT "%dd%dh%02dm%02ds" +#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" +#define JOBSTATS_NODELOCAL "nodelocal" + +extern int lprocfs_write_frac_helper(const char __user *buffer, + unsigned long count, int *val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +#if defined (CONFIG_PROC_FS) + +extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, + unsigned int cpuid); +/* + * \return value + * < 0 : on error (only possible for opc as LPROCFS_GET_SMP_ID) + */ +static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + int rc = 0; + + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 0; + } else { + unsigned int cpuid = get_cpu(); + + if (unlikely(stats->ls_percpu[cpuid] == NULL)) { + rc = lprocfs_stats_alloc_one(stats, cpuid); + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 1; + } else { + return stats->ls_biggest_alloc_num; + } + } +} + +static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } else { + put_cpu(); + } + return; + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if 
(stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + return; + } +} + +static inline unsigned int +lprocfs_stats_counter_size(struct lprocfs_stats *stats) +{ + unsigned int percpusize; + + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + + /* irq safe stats need lc_array_sum[1] */ + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpusize += stats->ls_num * sizeof(__s64); + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0) + percpusize = L1_CACHE_ALIGN(percpusize); + + return percpusize; +} + +static inline struct lprocfs_counter * +lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid, + int index) +{ + struct lprocfs_counter *cntr; + + cntr = &stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr + index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr, value) - optimized for by-one counters + * lprocfs_counter_add(cntr) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. + */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, + int idx, + enum lprocfs_fields_flags field) +{ + int i; + unsigned int num_cpu; + unsigned long flags = 0; + __u64 ret = 0; + + LASSERT(stats != NULL); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + ret += lprocfs_read_helper( + lprocfs_stats_counter_get(stats, i, idx), + &stats->ls_cnt_header[idx], stats->ls_flags, + field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obddev); +extern void lprocfs_free_md_stats(struct obd_device *obddev); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device *obd, + struct proc_dir_entry *entry); +extern int lprocfs_exp_setup(struct obd_export *exp, + 
lnet_nid_t *peer_nid, int *newnid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +extern struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + void *data, + struct file_operations *fops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +extern int +lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data); + +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); + +/* lprocfs_status.c */ +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data); + +extern struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, + void *data); + +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); + +extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list); +extern int lprocfs_obd_cleanup(struct obd_device *obd); + +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); +extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); + +/* Generic callbacks */ + +extern int lprocfs_rd_u64(struct seq_file *m, void *data); +extern int lprocfs_rd_atomic(struct seq_file *m, void *data); +extern int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uint(struct seq_file *m, void *data); +extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_name(struct seq_file *m, void *data); +extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_import(struct seq_file *m, void *data); +extern int lprocfs_rd_state(struct seq_file *m, void *data); +extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data); +extern int lprocfs_rd_num_exports(struct seq_file *m, void *data); +extern int lprocfs_rd_numrefs(struct seq_file *m, void *data); + +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_rd_timeouts(struct seq_file *m, void *data); +extern int lprocfs_wr_timeouts(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_wr_evict_client(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_ping(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_import(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n); +extern int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +/* Statfs helpers */ +extern int lprocfs_rd_blksize(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytestotal(struct 
seq_file *m, void *data); +extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data); +extern int lprocfs_rd_filestotal(struct seq_file *m, void *data); +extern int lprocfs_rd_filesfree(struct seq_file *m, void *data); + +extern int lprocfs_write_helper(const char __user *buffer, unsigned long count, + int *val); +extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); +extern int lprocfs_write_u64_helper(const char __user *buffer, + unsigned long count, __u64 *val); +extern int lprocfs_write_frac_u64_helper(const char *buffer, + unsigned long count, + __u64 *val, int mult); +extern char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +extern int lprocfs_single_release(struct inode *, struct file *); +extern int lprocfs_seq_release(struct inode *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; \ + } \ +} while (0) +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem) + + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_SEQ instead. 
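The counter API declared above is normally used in three steps: allocate a stats block sized for a private enum of indices, describe each index with lprocfs_counter_init() (plain by-one counters by default, LPROCFS_CNTR_AVGMINMAX for multi-valued ones), then bump counters on the fast path with lprocfs_counter_incr()/lprocfs_counter_add() and read them back with lprocfs_stats_collector(). A condensed sketch follows, assuming this header is included; the MY_STAT_* indices and function names are hypothetical, and registration under /proc via lprocfs_register_stats() is omitted.

/* Sketch: typical lprocfs stats usage (hypothetical indices and names). */
enum {
	MY_STAT_OPEN = 0,	/* simple by-one counter */
	MY_STAT_READ_BYTES,	/* multi-valued counter (bytes per call) */
	MY_STAT_LAST
};

static struct lprocfs_stats *my_stats;

static int my_stats_setup(void)
{
	my_stats = lprocfs_alloc_stats(MY_STAT_LAST, LPROCFS_STATS_FLAG_NONE);
	if (my_stats == NULL)
		return -ENOMEM;

	lprocfs_counter_init(my_stats, MY_STAT_OPEN, 0, "open", "reqs");
	lprocfs_counter_init(my_stats, MY_STAT_READ_BYTES,
			     LPROCFS_CNTR_AVGMINMAX, "read_bytes", "bytes");
	return 0;
}

static void my_stats_account_open(void)
{
	lprocfs_counter_incr(my_stats, MY_STAT_OPEN);		/* count += 1 */
}

static void my_stats_account_read(size_t nob)
{
	lprocfs_counter_add(my_stats, MY_STAT_READ_BYTES, nob); /* min/max/sum */
}

static __u64 my_stats_total_read_bytes(void)
{
	return lprocfs_stats_collector(my_stats, MY_STAT_READ_BYTES,
				       LPROCFS_FIELDS_FLAGS_SUM);
}

static void my_stats_cleanup(void)
{
	lprocfs_free_stats(&my_stats);	/* release the stats block */
}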
Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, PDE_DATA(inode)); \ +} \ +static struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_wr_##type(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type) + +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return lprocfs_wr_##type(file, buffer, count, off); \ + } \ + static int name##_##type##_open(struct inode *inode, struct file *file) \ + { \ + return single_open(file, NULL, PDE_DATA(inode)); \ + } \ + static struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = lprocfs_single_release, \ + } + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +/* lproc_status.c */ +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data); +int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer, + size_t count, loff_t *off); + +/* all quota proc functions */ +extern int lprocfs_quota_rd_bunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_btune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_iunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_itune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_seconds(struct file *file, + const char *buffer, + unsigned long count, void 
*data); +extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_qs(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_boundary_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_bunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_iunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_qs_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +#else +/* CONFIG_PROC_FS is not defined */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline int lprocfs_exp_setup(struct obd_export *exp, + lnet_nid_t 
*peer_nid, + int *newnid) +{ return 0; } +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) +{return NULL; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +static inline +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{return count;} +static inline +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ return 0; } + +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data) +{ return 0; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *dev, + struct lprocfs_vars *list) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_rd_u64(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_name(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_import(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) +{ return 0; } +static inline int lprocfs_rd_state(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ return 0; } +extern inline int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, + const char __user *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_evict_client(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_ping(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_import(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_pinger_recov(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +int 
lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* LPROCFS_SNMP_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lu_object.h b/kernel/drivers/staging/lustre/lustre/include/lu_object.h new file mode 100644 index 000000000..c8cc48f00 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lu_object.h @@ -0,0 +1,1340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lu_ref.h" + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. 
This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. 
+ */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __printf(3, 4); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. 
*/ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + unsigned ldt_device_nr; + /** + * Linkage into a global list of all device types. + * + * \see lu_device_types. + */ + struct list_head ldt_linkage; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. 
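+ *
+ * Illustrative use (not part of the upstream description): a caller that
+ * describes, say, a chmod/chown-style update fills only the fields it
+ * cares about and records them in la_valid, so consumers ignore the rest:
+ *
+ *	struct lu_attr attr = { 0 };
+ *
+ *	attr.la_mode  = S_IFREG | 0644;
+ *	attr.la_uid   = 0;
+ *	attr.la_gid   = 0;
+ *	attr.la_valid = LA_MODE | LA_UID | LA_GID;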
+ */ +struct lu_attr { + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + s64 la_mtime; + /** access time in seconds since Epoch */ + s64 la_atime; + /** change time in seconds since Epoch */ + s64 la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. + */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1 +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + */ +struct lu_object_header { + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. 
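+ *
+ * For example (illustrative only; "h" stands for a lu_object_header and
+ * do_something_with() is a placeholder), all layers of a compound object
+ * can be visited with:
+ *
+ *	struct lu_object *o;
+ *
+ *	list_for_each_entry(o, &h->loh_layers, lo_linkage)
+ *		do_something_with(o);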
+ */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +struct lu_site_bkt_data { + /** + * number of busy object on this bucket + */ + long lsb_busy; + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + struct cfs_hash *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; +}; + +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. 
+ */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); +void lu_types_stop(void); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true of object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); + +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr); + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) 
\ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + return o->lo_header->loh_attr; +} + +static inline void lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ + lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1) +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). 
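+ *
+ * Such a stand-alone environment is usually set up through the lu_env
+ * helpers declared later in this header, roughly like this (illustrative
+ * sketch only; do_something() is a placeholder for code that calls
+ * lu_object methods):
+ *
+ *	struct lu_env env;
+ *	int rc;
+ *
+ *	rc = lu_env_init(&env, LCT_LOCAL);
+ *	if (rc == 0) {
+ *		rc = do_something(&env);
+ *		lu_env_fini(&env);
+ *	}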
+ * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Context for transaction handle + */ + LCT_TX_HANDLE = 1 << 2, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. 
Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + struct module *lct_owner; + /** + * References to this key. For debugging. 
+ */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void *data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod, type); \ + LU_KEY_FINI(mod, type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. + */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). 
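+ *
+ * For example (illustrative only; "foo" and struct foo_thread_info are
+ * placeholders), a layer normally combines the macros defined above as:
+ *
+ *	LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *	LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *	LU_TYPE_INIT_FINI(foo, &foo_thread_key);
+ *
+ * which generates foo_key_init()/foo_key_fini(), the foo_thread_key
+ * context key, and foo_type_init()/foo_type_fini()/foo_type_start()/
+ * foo_type_stop() suitable for the lu_device_type_operations hooks.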
+ */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + ssize_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, int size); +void lu_buf_realloc(struct lu_buf *buf, int size); + +int lu_buf_check_and_grow(struct lu_buf *buf, int len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len); + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lu_ref.h b/kernel/drivers/staging/lustre/lustre/include/lu_ref.h new file mode 100644 index 000000000..b451a888c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lu_ref.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __func__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __func__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + + +/* + * dummy data structures/functions to pass compile for now. + * We need to reimplement them with kref. 
+ */ +struct lu_ref {}; +struct lu_ref_link {}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h b/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h new file mode 100644 index 000000000..e8e0b084a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. 
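+ *
+ * A typical calling sequence for this library looks roughly as follows
+ * (illustrative only: the path, geometry values and the keybuf/recbuf
+ * buffers are made up, and error handling is elided; every function used
+ * here is declared below in this header):
+ *
+ *	struct iam_uapi_info ua;
+ *	int fd;
+ *
+ *	iam_creat("/mnt/lustre/some_index", FMT_LFIX, 4096, 8, 16, 4);
+ *	fd = iam_open("/mnt/lustre/some_index", &ua);
+ *	iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ *	iam_close(fd);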
+ */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. + */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 000000000..ad253c6de --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. 
This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + + + +struct ll_fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_device; /* device number for this extent */ + __u32 fe_reserved[2]; +}; + +struct ll_user_fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_DIRECT. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ + + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return (sizeof(struct ll_user_fiemap) + extent_count * + sizeof(struct ll_fiemap_extent)); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return ((array_size - sizeof(struct ll_user_fiemap)) / + sizeof(struct ll_fiemap_extent)); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. 
+ * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h new file mode 100644 index 000000000..93a3d7db3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h @@ -0,0 +1,2 @@ +#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0" +#define LUSTRE_RELEASE 3.9.0_g6e62c21 diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h new file mode 100644 index 000000000..35aefa2cd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h @@ -0,0 +1,215 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
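+ *
+ * Illustrative use of the translation helpers declared at the end of this
+ * header (sketch only; "rc" stands for a positive host errno value being
+ * packed into a wire message):
+ *
+ *	unsigned int wire_rc = lustre_errno_hton(rc);	    (host -> wire)
+ *	unsigned int host_rc = lustre_errno_ntoh(wire_rc); (wire -> host)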
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* Key 
has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ +#define LUSTRE_EIOCBRETRY 530 /* iocb queued, will trigger a retry */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h new file mode 100644 index 000000000..305ecbee9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h @@ -0,0 +1,3734 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_idl.h + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. 
Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types (__u8, __u16, __u32, __u64) or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures, and the + * prototypes of the swabber functions for each struct. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, + * implemented either here, inline (trivial implementations) or in + * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" + * endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include "../../../include/linux/libcfs/libcfs.h" + +/* Defn's shared with user-space. 
*/ +#include "lustre_user.h" +#include "lustre_errno.h" + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +/* Lustre service names are following the format + * service name + MDT + seq name + */ +#define LUSTRE_MDT_MAXNAMELEN 80 + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +//#define OSC_REQUEST_PORTAL 3 +#define OSC_REPLY_PORTAL 4 +//#define OSC_BULK_PORTAL 5 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +//#define MDC_REQUEST_PORTAL 9 +#define MDC_REPLY_PORTAL 10 +//#define MDC_BULK_PORTAL 11 +#define MDS_REQUEST_PORTAL 12 +//#define MDS_REPLY_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 + +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 + +#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +static inline int fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +static inline int fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * This all range is only being used when fld client sends fld query request, + * but it does not know whether the seq is MDT or OST, so it will send req + * with ALL type, which means either seq type gotten from lookup can be + * expected. 
+ */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * returns width of given range \a r + */ + +static inline __u64 range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero + */ + +static inline void range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a r + */ + +static inline int range_within(const struct lu_seq_range *range, + __u64 s) +{ + return s >= range->lsr_start && s < range->lsr_end; +} + +static inline int range_is_sane(const struct lu_seq_range *range) +{ + return (range->lsr_end >= range->lsr_start); +} + +static inline int range_is_zero(const struct lu_seq_range *range) +{ + return (range->lsr_start == 0 && range->lsr_end == 0); +} + +static inline int range_is_exhausted(const struct lu_seq_range *range) + +{ + return range_space(range) == 0; +} + +/* return 0 if two range have the same location */ +static inline int range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +#define DRANGE "[%#16.16Lx-%#16.16Lx):%x:%s" + +#define PRANGE(range) \ + (range)->lsr_start, \ + (range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? "mdt" : "ost" + + +/** \defgroup lu_fid lu_fid + * @{ */ + +/** + * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat. + * Deprecated since HSM and SOM attributes are now stored in separate on-disk + * xattr. + */ +enum lma_compat { + LMAC_HSM = 0x00000001, + LMAC_SOM = 0x00000002, + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. 
+ */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ +}; +#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT) + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return ((__u64)fid_ver(fid) << 32 | fid_oid(fid)); +} + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + * + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_OST_MDT1 = 3, + FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. 
*/ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, +}; + +static inline int fid_seq_is_mdt0(__u64 seq) +{ + return (seq == FID_SEQ_OST_MDT0); +} + +static inline int fid_seq_is_mdt(const __u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline int fid_seq_is_echo(__u64 seq) +{ + return (seq == FID_SEQ_ECHO); +} + +static inline int fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline int fid_seq_is_llog(__u64 seq) +{ + return (seq == FID_SEQ_LLOG); +} + +static inline int fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline int fid_seq_is_rsvd(const __u64 seq) +{ + return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD); +}; + +static inline int fid_seq_is_special(const __u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline int fid_seq_is_local_file(const __u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline int fid_seq_is_root(const __u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline int fid_seq_is_dot(const __u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline int fid_seq_is_default(const __u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline int fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = 1; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is a igif; otherwise false. + */ +static inline int fid_seq_is_igif(const __u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline int fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is a idif; otherwise false. 
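/*
 * Illustrative sketch, not part of the imported header: a hypothetical helper
 * (fid_seq_name is invented here) mapping a few concrete sequence values onto
 * the namespaces enumerated above, e.g. seq 0 -> "mdt0", seq 5 -> "reserved",
 * seq 0x2000 -> "igif", seq 0x100050001 -> "idif", seq 0x200000400 -> "normal".
 */
static inline const char *fid_seq_name(__u64 seq)
{
        if (fid_seq_is_mdt0(seq))
                return "mdt0";
        if (fid_seq_is_rsvd(seq))
                return "reserved";
        if (fid_seq_is_igif(seq))
                return "igif";
        if (seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX)
                return "idif";
        return seq >= FID_SEQ_NORMAL ? "normal" : "special/local";
}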
+ */ +static inline int fid_seq_is_idif(const __u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline int fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline int fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline int fid_seq_is_norm(const __u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline int fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return (fid_seq(fid) >> 16) & 0xffff; +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid_seq(ostid))) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) */ + if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. + */ +static inline void ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (oid >= IDIF_MAX_OID) { + CERROR("Bad %llu to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi.oi_id = oid; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Bad %llu to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi_fid.f_oid = oid; + } +} + +static inline void ostid_inc_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) { + CERROR("Bad inc "DOSTID"\n", POSTID(oi)); + return; + } + oi->oi.oi_id++; + } else { + oi->oi_fid.f_oid++; + } +} + +static inline void ostid_dec_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) + oi->oi.oi_id--; + else + oi->oi_fid.f_oid--; +} + +/** + * Unpack an OST object id/seq (group) into a FID. 
This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. For reference see: + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs + */ +static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid, + __u32 ost_idx) +{ + if (ost_idx > 0xffff) { + CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid), + ost_idx); + return -EBADF; + } + + if (fid_seq_is_mdt0(ostid_seq(ostid))) { + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. */ + if (ostid_id(ostid) >= IDIF_MAX_OID) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = ostid_id(ostid); + /* in theory, not currently used */ + fid->f_ver = ostid_id(ostid) >> 48; + } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ { + /* This is either an IDIF object, which identifies objects across + * all OSTs, or a regular FID. The IDIF namespace maps legacy + * OST objects into the FID namespace. In both cases, we just + * pass the FID through, no conversion needed. */ + if (ostid->oi_fid.f_ver != 0) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + *fid = ostid->oi_fid; + } + + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid), + fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return 0; +} + +/* Check whether the fid is for LAST_ID */ +static inline int fid_is_last_id(const struct lu_fid *fid) +{ + return (fid_oid(fid) == 0); +} + +/** + * Get inode number from a igif. + * \param fid a igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +extern void lustre_swab_ost_id(struct ost_id *oid); + +/** + * Get inode generation from a igif. + * \param fid a igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
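/*
 * Illustrative sketch, not part of the imported header: round-tripping a
 * legacy "group 0" OST object through the IDIF mapping above.  For objid
 * 0x123456789 on ost_idx 5, ostid_to_fid() yields f_seq == 0x100050001
 * (FID_SEQ_IDIF | idx << 16 | high 16 bits of the objid), f_oid == 0x23456789
 * (low 32 bits) and f_ver == 0, and fid_idif_id() recovers the original objid.
 * The function name is invented for this example.
 */
static inline int ostid_idif_round_trip_example(void)
{
        struct ost_id oi;
        struct lu_fid fid;

        memset(&oi, 0, sizeof(oi));
        ostid_set_seq_mdt0(&oi);                /* legacy "group 0" object */
        ostid_set_id(&oi, 0x123456789ULL);

        if (ostid_to_fid(&fid, &oi, 5) != 0)
                return 0;
        /* both sides evaluate to the original objid, 0x123456789 */
        return fid_idif_id(fid_seq(&fid), fid_oid(&fid), fid_ver(&fid)) ==
               ostid_id(&oi);
}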
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = cpu_to_le64(fid_seq(src)); + dst->f_oid = cpu_to_le32(fid_oid(src)); + dst->f_ver = cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = le64_to_cpu(fid_seq(src)); + dst->f_oid = le32_to_cpu(fid_oid(src)); + dst->f_ver = le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = cpu_to_be64(fid_seq(src)); + dst->f_oid = cpu_to_be32(fid_oid(src)); + dst->f_ver = cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = be64_to_cpu(fid_seq(src)); + dst->f_oid = be32_to_cpu(fid_oid(src)); + dst->f_ver = be32_to_cpu(fid_ver(src)); +} + +static inline int fid_is_sane(const struct lu_fid *fid) +{ + return fid != NULL && + ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline int fid_is_zero(const struct lu_fid *fid) +{ + return fid_seq(fid) == 0 && fid_oid(fid) == 0; +} + +extern void lustre_swab_lu_fid(struct lu_fid *fid); +extern void lustre_swab_lu_seq_range(struct lu_seq_range *range); + +static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return memcmp(f0, f1, sizeof(*f0)) == 0; +} + +#define __diff_normalize(val0, val1) \ +({ \ + typeof(val0) __val0 = (val0); \ + typeof(val1) __val1 = (val1); \ + \ + (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \ +}) + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + return + __diff_normalize(fid_seq(f0), fid_seq(f1)) ?: + __diff_normalize(fid_oid(f0), fid_oid(f1)) ?: + __diff_normalize(fid_ver(f0), fid_ver(f1)); +} + +static inline void ostid_cpu_to_le(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, +}; + +#define LU_DIRENT_ATTRS_MASK 0xf800 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. 
*/ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. + * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +#ifndef IFSHIFT +#define IFSHIFT 12 +#endif + +#ifndef IFTODT +#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << IFSHIFT) +#endif + + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline int lu_dirent_calc_size(int namelen, __u16 attr) +{ + int size; + + if (attr & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; + + return (size + 7) & ~7; +} + +static inline int lu_dirent_size(struct lu_dirent *ent) +{ + if (le16_to_cpu(ent->lde_reclen) == 0) { + return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen), + le32_to_cpu(ent->lde_attrs)); + } + return le16_to_cpu(ent->lde_reclen); +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_CACHE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ. 
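/*
 * Illustrative sketch, not part of the imported header: walking the entries
 * of one readdir page with the accessors defined above.  Simplified for the
 * example (the name lu_dirpage_walk_example and the callback are invented);
 * real consumers also bound the walk by ldp_hash_end and byte-swap the
 * little-endian fields as needed.
 */
static inline void lu_dirpage_walk_example(struct lu_dirpage *dp,
                                           void (*cb)(const struct lu_dirent *))
{
        struct lu_dirent *ent;

        for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent))
                cb(ent);        /* lde_name holds lde_namelen bytes */
}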
+ */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline int lustre_handle_is_used(struct lustre_handle *lh) +{ + return lh->cookie != 0ull; +} + +static inline int lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + +/* without gss, ptlrpc_body is put at the first buffer. */ +#define PTLRPC_NUM_VERSIONS 4 +#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; + char pb_jobid[JOBSTATS_JOBID_SIZE]; +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; +}; + +extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Flags that are operation-specific go in the top 16 bits. 
*/ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. */ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated. + *We do not support JOIN FILE + *anymore, reserve this flags + *just for preventing such bit + *to be reused.*/ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. 
*/ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/*create stripe disposition*/ + +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. 
*/ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | \ + OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \ + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \ + OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \ + OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE) + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS) +#define ECHO_CONNECT_SUPPORTED (0) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS) + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) + +#define OBD_OCD_VERSION(major, minor, patch, fix) (((major)<<24) + \ + ((minor)<<16) + \ + ((patch)<<8) + (fix)) +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) +#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. 
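/*
 * Illustrative sketch, not part of the imported header: the version packing
 * above is plain byte packing, e.g.
 *
 *   OBD_OCD_VERSION(2, 5, 3, 0)        == 0x02050300
 *   OBD_OCD_VERSION_MAJOR(0x02050300)  == 2
 *   OBD_OCD_VERSION_MINOR(0x02050300)  == 5
 *   OBD_OCD_VERSION_PATCH(0x02050300)  == 3
 *
 * and a feature test on the negotiated flags is a simple mask check
 * (equivalently OCD_HAS_FLAG(ocd, JOBSTATS) once struct obd_connect_data is
 * defined below).  The helper name is invented for this example.
 */
static inline int connect_flags_have_jobstats_example(__u64 ocd_connect_flags)
{
        return !!(ocd_connect_flags & OBD_CONNECT_JOBSTATS);
}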
*/ +struct obd_connect_data_v1 { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* also fix lustre_swab_connect */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ +}; + +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. 
Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + + +extern void lustre_swab_connect(struct obd_connect_data *ocd); + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. + * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. + */ +typedef enum { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +typedef enum { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LAST_OPC +} ost_cmd_t; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C, + + /* mask for local-only flag, which won't be sent over network */ + OBD_FL_LOCAL_MASK = 0xF0000000, +}; + +#define LOV_MAGIC_V1 0x0BD10BD0 +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_MAGIC_V3 0x0BD30BD0 + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. 
+ * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + */ +#define LOV_MAGIC_V1_DEF 0x0CD10BD0 +#define LOV_MAGIC_V3_DEF 0x0CD30BD0 + +#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ +#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ +#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */ +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ + +#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) +#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. 
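/*
 * Illustrative sketch, not part of the imported header: splitting an
 * lmm_pattern word with the macros above.  For a RAID0-striped file that has
 * been HSM-released, lmm_pattern == 0x80000001 and
 *
 *   lov_pattern(0x80000001)       == LOV_PATTERN_RAID0      (0x001)
 *   lov_pattern_flags(0x80000001) == LOV_PATTERN_F_RELEASED (0x80000000)
 *
 * The helper name is invented for this example.
 */
static inline int lov_pattern_released_example(__u32 lmm_pattern)
{
        return !!(lov_pattern_flags(lmm_pattern) & LOV_PATTERN_F_RELEASED);
}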
+ */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline __u64 lmm_oi_id(struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */ + +#define MAX_MD_SIZE \ + (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MIN_MD_SIZE \ + (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_LUSTRE_PREFIX "lustre." + +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace" + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define 
OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ +#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */ +#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */ +#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */ +#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */ + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget 
obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss); + +extern void lustre_swab_obd_statfs (struct obd_statfs *os); + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. */ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ + +#define OBD_OBJECT_EOF 0xffffffffffffffffULL + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +#define IOOBJ_MAX_BRW_BITS 16 +#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1) +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo); + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 offset; + __u32 len; + __u32 flags; +}; + +extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
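/*
 * Illustrative sketch, not part of the imported header: ioo_max_brw stores
 * (count - 1) in its high 16 bits, so ioobj_max_brw_set(&ioo, 8) leaves
 * ioo_max_brw == 0x00070000 and ioobj_max_brw_get(&ioo) returns 8.  The
 * function name is invented for this example.
 */
static inline int ioobj_max_brw_round_trip_example(__u32 num)
{
        struct obd_ioobj ioo;

        memset(&ioo, 0, sizeof(ioo));
        ioobj_max_brw_set(&ioo, num);
        return ioobj_max_brw_get(&ioo) == num;  /* holds for 1 <= num <= 65536 */
}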
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +extern void lustre_swab_ost_lvb(struct ost_lvb *lvb); + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +#define QUOTABLOCK_BITS 10 +#endif + +#ifndef QUOTABLOCK_SIZE +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +#endif + +#ifndef toqb +#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS) +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +extern void lustre_swab_obd_quotactl(struct obd_quotactl *q); + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
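 * It is then read back through the qb_slv_fid alias defined just below.
 *
 * Editor's sketch (assumed DQACQ usage, not taken from the imported
 * code): a slave acquiring more block quota for a user would fill the
 * body roughly as
 *
 *	body->qb_id.qid_uid = uid;
 *	body->qb_flags      = QUOTA_DQACQ_FL_ACQ | QUOTA_DQACQ_FL_PREACQ;
 *	body->qb_count      = nr_kbytes;
 *	body->qb_usage      = cur_usage_kbytes;
 *
 * before sending a QUOTA_DQACQ request to the QMT.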
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +extern void lustre_swab_quota_body(struct quota_body *b); + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +typedef enum { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +} quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GETSTATUS = 40, + MDS_STATFS = 41, + MDS_PIN = 42, + MDS_UNPIN = 43, + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, + 
MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_LAST_OPC +} mds_cmd_t; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +typedef enum { + UPDATE_OBJ = 1000, + UPDATE_LAST_OPC +} update_cmd_t; + +#define UPDATE_FIRST_OPC UPDATE_OBJ + +/* + * Do not exceed 63 + */ + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, +// REINT_WRITE = 9, + REINT_MAX +} mds_reint_t, mdt_reint_t; + +extern void lustre_swab_generic_32s (__u32 *val); + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 + +/* INODE LOCK PARTS */ +#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also + * was used to protect permission (mode, + * owner, group etc) before 2.4. */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ + +/* The PERM bit is added int 2.4, and it is used to protect permission(mode, + * owner, group, acl etc), so to separate the permission from LOOKUP lock. + * Because for remote directories(in DNE), these locks will be granted by + * different MDTs(different ldlm namespace). + * + * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. + * For Remote directory, the master MDT, where the remote directory is, will + * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, + * will grant LOOKUP_LOCK. */ +#define MDS_INODELOCK_PERM 0x000010 +#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ + +#define MDS_INODELOCK_MAXSHIFT 5 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) + +extern void lustre_swab_ll_fid (struct ll_fid *fid); + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). */ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +/* mdt_thread_info.mti_flags. */ +enum md_op_flags { + /* The flag indicates Size-on-MDS attributes are changed. */ + MF_SOM_CHANGE = (1 << 0), + /* Flags indicates an epoch opens or closes. 
*/ + MF_EPOCH_OPEN = (1 << 1), + MF_EPOCH_CLOSE = (1 << 2), + MF_MDC_CANCEL_FID1 = (1 << 3), + MF_MDC_CANCEL_FID2 = (1 << 4), + MF_MDC_CANCEL_FID3 = (1 << 5), + MF_MDC_CANCEL_FID4 = (1 << 6), + /* There is a pending attribute update. */ + MF_SOM_AU = (1 << 7), + /* Cancel OST locks while getattr OST attributes. */ + MF_GETATTR_LOCK = (1 << 8), + MF_GET_MDT_IDX = (1 << 9), +}; + +#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE) + +#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1 + +/* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ +#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */ +#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */ +#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */ +#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? 
LUSTRE_IMMUTABLE_FL : 0)); +} + +/* 64 possible states */ +enum md_transient_state { + MS_RESTORE = (1 << 0), /* restore is running */ +}; + +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; + struct lustre_handle handle; + __u64 valid; + __u64 size; /* Offset, in the case of MDS_READPAGE */ + __s64 mtime; + __s64 atime; + __s64 ctime; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ + __u64 ioepoch; + __u64 t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 */ + __u32 fsuid; + __u32 fsgid; + __u32 capability; + __u32 mode; + __u32 uid; + __u32 gid; + __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */ + __u32 rdev; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 unused2; /* was "generation" until 2.4.0 */ + __u32 suppgid; + __u32 eadatasize; + __u32 aclsize; + __u32 max_mdsize; + __u32 max_cookiesize; + __u32 uid_h; /* high 32-bits of uid, for FUID */ + __u32 gid_h; /* high 32-bits of gid, for FUID */ + __u32 padding_5; /* also fix lustre_swab_mdt_body */ + __u64 padding_6; + __u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; +}; /* 216 */ + +extern void lustre_swab_mdt_body (struct mdt_body *b); + +struct mdt_ioepoch { + struct lustre_handle handle; + __u64 ioepoch; + __u32 flags; + __u32 padding; +}; + +extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b); + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, + CFS_RMTACL_PERM = 0x08, + CFS_RMTOWN_PERM = 0x10 +}; + +/* inode access permission for remote user, the inode info are omitted, + * for client knows them. */ +struct mdt_remote_perm { + __u32 rp_uid; + __u32 rp_gid; + __u32 rp_fsuid; + __u32 rp_fsuid_h; + __u32 rp_fsgid; + __u32 rp_fsgid_h; + __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */ + __u32 rp_padding; +}; + +extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p); + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_padding_3; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa); + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. 
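 *
 * Illustration (editor's sketch, not the actual MDC code): a client
 * translates the in-kernel iattr mask before filling sa_valid, e.g.
 *
 *	if (ia_valid & ATTR_MODE)
 *		sa_valid |= MDS_ATTR_MODE;
 *	if (ia_valid & ATTR_UID)
 *		sa_valid |= MDS_ATTR_UID;
 *
 * so the wire encoding stays stable across client/server kernel versions.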
+ */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* IO Epoch is opened on a closed file. */ +#define MDS_FMODE_EPOCH 01000000 +/* IO Epoch is opened on a file truncate. */ +#define MDS_FMODE_TRUNC 02000000 +/* Size-on-MDS Attribute Update is pending. */ +#define MDS_FMODE_SOM 04000000 + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
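 * (Editor's note: like the other MDS_OPEN_* values these are octal
 * constants; the full 64-bit open flag word travels in mdt_rec_create
 * through the cr_flags_l/cr_flags_h pair, set and read with the
 * set_mrc_cr_flags()/get_mrc_cr_flags() helpers defined further down.)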
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +/* permission for create non-directory file */ +#define MAY_CREATE (1 << 7) +/* permission for create directory file */ +#define MAY_LINK (1 << 8) +/* permission for delete from the directory */ +#define MAY_UNLINK (1 << 9) +/* source's permission for rename */ +#define MAY_RENAME_SRC (1 << 10) +/* target's permission for rename */ +#define MAY_RENAME_TAR (1 << 11) +/* part (parent's) VTX permission check */ +#define MAY_VTX_PART (1 << 12) +/* full VTX permission check */ +#define MAY_VTX_FULL (1 << 13) +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +enum mds_op_bias { + MDS_CHECK_SPLIT = 1 << 0, + MDS_CROSS_REF = 1 << 1, + MDS_VTX_BYPASS = 1 << 2, + MDS_PERM_BYPASS = 1 << 3, + MDS_SOM = 1 << 4, + MDS_QUOTA_IGNORE = 1 << 5, + MDS_CLOSE_CLEANUP = 1 << 6, + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ + __s64 cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc) +{ + return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32)); +} + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + 
__u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structures and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
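 *
 * Editor's sketch (not from the imported code): the size invariant can be
 * enforced at build time with checks along the lines of
 *
 *	BUILD_BUG_ON(sizeof(struct mdt_rec_create) !=
 *		     sizeof(struct mdt_rec_reint));
 *
 * repeated for each mdt_rec_* variant, since they are all swabbed through
 * the common mdt_rec_reint layout.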
+ */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + __s64 rr_mtime; + __s64 rr_atime; + __s64 rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); + +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +extern void lustre_swab_lmv_desc (struct lmv_desc *ld); + +/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ +struct lmv_stripe_md { + __u32 mea_magic; + __u32 mea_count; + __u32 mea_master; + __u32 mea_padding; + char mea_pool_name[LOV_MAXPOOLNAME]; + struct lu_fid mea_ids[0]; +}; + +extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea); + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. 
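 * (Editor's note, assuming the usual 40-byte struct obd_uuid: 8192 bytes
 * holds roughly 8192 / 40 ~ 200 packed UUIDs.)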
With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __u64 ld_default_stripe_offset; /* in bytes */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +extern void lustre_swab_lov_desc (struct lov_desc *ld); + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%llx" +#define PLDLMRES(res) (res)->lr_name.name[0], (res)->lr_name.name[1], \ + (res)->lr_name.name[2], (res)->lr_name.name[3] + +extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id); + +static inline int ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return !memcmp(res0, res1, sizeof(*res0)); +} + +/* lock types */ +typedef enum { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +} ldlm_mode_t; + +#define LCK_MODE_NUM 8 + +typedef enum { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +} ldlm_type_t; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->start) && (ex1->end >= ex2->end); +} + +struct ldlm_inodebits { + __u64 bits; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
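 *
 * Editor's sketch (not from the imported code): the layout assumption can
 * be written down as
 *
 *	BUILD_BUG_ON(offsetof(struct ldlm_flock_wire, lfw_owner) !=
 *		     offsetof(struct ldlm_extent, gid));
 *
 * i.e. the three leading __u64 members of both structures line up.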
*/ + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_wire_policy_data_t; + +extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d); + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; +}; + +extern void lustre_swab_gl_desc(union ldlm_gl_desc *); + +struct ldlm_intent { + __u64 opc; +}; + +extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); + +struct ldlm_resource_desc { + ldlm_type_t lr_type; + __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r); + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + ldlm_mode_t l_req_mode; + ldlm_mode_t l_granted_mode; + ldlm_wire_policy_data_t l_policy_data; +}; + +extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +extern void lustre_swab_ldlm_request (struct ldlm_request *rq); + +/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available. + * Otherwise, 2 are available. */ +#define ldlm_request_bufsize(count, type) \ +({ \ + int _avail = LDLM_LOCKREQ_HANDLES; \ + _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \ + sizeof(struct ldlm_request) + \ + (count > _avail ? count - _avail : 0) * \ + sizeof(struct lustre_handle); \ +}) + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; +}; +extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. 
*/ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; +extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u8 mcb_reserved; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; +extern void lustre_swab_mgs_config_body(struct mgs_config_body *body); + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + __u64 mcr_size; /* size of the log */ +}; +extern void lustre_swab_mgs_config_res(struct mgs_config_res *body); + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +extern void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +/* + * Opcodes for multiple servers. + */ + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +/* catalog of log objects */ + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) + */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. 
+ * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +}; + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +}; + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_padding; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask */ +#define CHANGELOG_DEFMASK CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +struct llog_changelog_ext_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_ext_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +static inline bool agent_req_in_final_state(enum agent_req_status ars) +{ + return ((ars == ARS_SUCCEED) || (ars == ARS_FAILED) || + (ars == ARS_CANCELED)); +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + 
* agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. to the agent */ + struct llog_rec_tail arr_tail; /**< record tail for_sizezof_only */ +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) +#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE) + +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, +}; + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + __s64 llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + __u32 llh_cat_idx; + /* for a catalog the first plain slot is next to it */ + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23]; + __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); + +#define LLOG_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) + +/** log cookies are used to reference a specific log file and a record therein */ +struct llog_cookie { + struct llog_logid lgc_lgl; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* deprecated */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + __u64 o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + __u64 o_parent_seq; + __u64 o_size; /* o_size-o_blocks == ost_lvb */ + __s64 o_mtime; + __s64 o_atime; + __s64 o_ctime; + __u64 o_blocks; /* brw: cli sent cached bytes */ + __u64 o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + __u32 o_blksize; /* optimal IO blocksize */ + __u32 o_mode; /* brw: cli sent cache remain */ + __u32 o_uid; + __u32 o_gid; + __u32 o_flags; + __u32 o_nlink; /* brw: checksum */ + __u32 o_parent_oid; + __u32 o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* 
brw: lock handle to prolong + * locks */ + struct llog_cookie o_lcookie; /* destroy: unlink cookie from + * MDS */ + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. + * brw: grant space consumed on + * the client for the write */ + __u64 o_padding_4; + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + wobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} + +static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + __u32 local_flags = 0; + + if (lobdo->o_valid & OBD_MD_FLFLAGS) + local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK; + + *lobdo = *wobdo; + if (local_flags != 0) { + lobdo->o_valid |= OBD_MD_FLFLAGS; + lobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + lobdo->o_flags |= local_flags; + } + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} + +extern void lustre_swab_obdo (struct obdo *o); + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char name[8]; + struct obdo oa; + struct ll_user_fiemap fiemap; +}; + +extern void lustre_swab_ost_body (struct ost_body *b); +extern void lustre_swab_ost_last_id(__u64 *id); +extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap); + +extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); + +/* llog_swab.c */ +extern void lustre_swab_llogd_body (struct llogd_body *d); +extern void lustre_swab_llog_hdr (struct llog_log_hdr *h); +extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d); +extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +extern void lustre_swab_llog_id(struct llog_logid *lid); + +struct lustre_cfg; +extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_obdo(struct obdo *oa); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. 
The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. + * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; +extern void lustre_swab_idx_info(struct idx_info *ii); + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. 
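 *
 * Editor's illustration (an assumption about the fixed-size layout, not
 * taken from the imported code): if each pair is preceded by a __u64 hash,
 * entry i would start at
 *
 *	lip_entries + i * (sizeof(__u64) + ii_keysize + ii_recsize)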
*/ + char lip_entries[0]; +}; +extern void lustre_swab_lip_header(struct lu_idxpage *lip); + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +typedef enum { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +} sec_cmd_t; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa(struct lustre_capa *c); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) +#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* MDS capability covers object capability for operations of body r/w + * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w, + * while OSS capability only covers object capability for operations of + * oss data(file content) r/w/truncate. 
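 *
 * Example (editor's note): a capability whose lc_opc includes
 * CAPA_OPC_INDEX_LOOKUP is classified as an MDS capability by
 * capa_for_mds() below, while one carrying only, say,
 * CAPA_OPC_OSS_RW | CAPA_OPC_OSS_TRUNC is reported by capa_for_oss().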
+ */ +static inline int capa_for_mds(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0; +} + +static inline int capa_for_oss(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0; +} + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + /* future use */ + __u32 padding1; + __u32 padding2; +}; + +/** Hardlink data is name and parent fid. + * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +}__attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + char gf_path[0]; +} __attribute__((packed)); + +void lustre_swab_fid2path (struct getinfo_fid2path *gf); + +enum { + LAYOUT_INTENT_ACCESS = 0, + LAYOUT_INTENT_READ = 1, + LAYOUT_INTENT_WRITE = 2, + LAYOUT_INTENT_GLIMPSE = 3, + LAYOUT_INTENT_TRUNC = 4, + LAYOUT_INTENT_RELEASE = 5, + LAYOUT_INTENT_RESTORE = 6 +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + __u64 li_start; + __u64 li_end; +}; + +void lustre_swab_layout_intent(struct layout_intent *li); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + lustre_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_current_action(struct hsm_current_action *action); +extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +extern void lustre_swab_hsm_request(struct hsm_request *hr); + +/** + * These are object update opcode under UPDATE_OBJ, which is currently + * being used by cross-ref operations between MDT. + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * Update request format + * magic: UPDATE_BUFFER_MAGIC_V1 + * Count: How many updates in the req. + * bufs[0] : following are packets of object. + * update[0]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. 
+ * update[1]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * .......... + * update[7]: type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * Current 8 maxim updates per object update request. + * + ******************************************************************* + * update reply format: + * + * ur_version: UPDATE_REPLY_V1 + * ur_count: The count of the reply, which is usually equal + * to the number of updates in the request. + * ur_lens: The reply lengths of each object update. + * + * replies: 1st update reply [4bytes_ret: other body] + * 2nd update reply [4bytes_ret: other body] + * ..... + * nth update reply [4bytes_ret: other body] + * + * For each reply of the update, the format would be + * result(4 bytes):Other stuff + */ + +#define UPDATE_MAX_OPS 10 +#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001 +#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1 +#define UPDATE_BUF_COUNT 8 +enum object_update_op { + OBJ_CREATE = 1, + OBJ_DESTROY = 2, + OBJ_REF_ADD = 3, + OBJ_REF_DEL = 4, + OBJ_ATTR_SET = 5, + OBJ_ATTR_GET = 6, + OBJ_XATTR_SET = 7, + OBJ_XATTR_GET = 8, + OBJ_INDEX_LOOKUP = 9, + OBJ_INDEX_INSERT = 10, + OBJ_INDEX_DELETE = 11, + OBJ_LAST +}; + +struct update { + __u32 u_type; + __u32 u_batchid; + struct lu_fid u_fid; + __u32 u_lens[UPDATE_BUF_COUNT]; + __u32 u_bufs[0]; +}; + +struct update_buf { + __u32 ub_magic; + __u32 ub_count; + __u32 ub_bufs[0]; +}; + +#define UPDATE_REPLY_V1 0x00BD0001 +struct update_reply { + __u32 ur_version; + __u32 ur_count; + __u32 ur_lens[0]; +}; + +void lustre_swab_update_buf(struct update_buf *ub); +void lustre_swab_update_reply_buf(struct update_reply *ur); + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __packed; + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + __u64 cd_reserved[8]; +}; + +void lustre_swab_close_data(struct close_data *data); + +#endif +/** @} lustreidl */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 000000000..1c87a61a7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. 
+ * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, +}; + +enum lfsck_type { + /* For MDT-OST consistency check/repair. */ + LT_LAYOUT = 0x0001, + + /* For MDT-MDT consistency check/repair. */ + LT_DNE = 0x0002, + + /* For FID-in-dirent and linkEA consistency check/repair. */ + LT_NAMESPACE = 0x0004, +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_TYPES_ALL ((__u16)(~0)) +#define LFSCK_TYPES_DEF ((__u16)0) +#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* For 64-bits aligned. */ + __u16 ls_padding; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h new file mode 100644 index 000000000..89794fdfe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -0,0 +1,1179 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. 
+ */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include "ll_fiemap.h" +#include "../linux/lustre_user.h" + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ +}; + +/* keep this one for compatibility */ +struct filter_fid_old { + struct lu_fid ff_parent; + __u64 ff_objid; + __u64 ff_seq; +}; + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. 
+ */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct ostid { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ostid_seq(oi), ostid_id(oi) + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* see for ioctl numberss 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */ +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */ +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */ +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long) +#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid) +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */ +#define LL_IOC_QUOTACHECK _IOW ('f', 160, int) +/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */ +#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */ +#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *) +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +#define LL_IOC_RMTACL _IOW ('f', 167, long) +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) + +/* see for ioctl numbers 177-210 */ + +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* see for ioctl numbers 221-232 */ + +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define 
LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_SET_LEASE _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +/* Keep these for backward compartability. */ +#define LL_IOC_OBD_STATFS IOC_OBD_STATFS +#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE + + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files (See LU-812, LU-4209). */ +#define O_LOV_DELAY_CREATE (O_NOCTTY | FASYNC) + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ +#define LL_FILE_RMTACL 0x00000020 + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 + +#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */ +#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/ + +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 + +#define LOV_MAXPOOLNAME 16 +#define LOV_POOLNAMEF "%.16s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simplified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. 
+ * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_USER_MAGIC_V3) + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); + else + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . 
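+ *
+ * Illustrative only: an application that wants the lov_user_mds_data
+ * structures below would typically add something like
+ * -D_GNU_SOURCE -DHAVE_LOV_USER_MDS_DATA to its compiler flags before
+ * including this header; the exact flags depend on the application's
+ * build system and are an assumption of this note.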
*/ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +/* keep this to be the same size as lov_user_ost_data_v1 */ +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +/* lum_type */ +enum { + LMV_STRIPE_TYPE = 0, + LMV_DEFAULT_TYPE = 1, +}; + +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME]; + struct lmv_user_mds_data lum_objects[0]; +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum); + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p) + *p = '\0'; +} + +/* printf display format + e.g. printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) \ + (fid)->f_seq, \ + (fid)->f_oid, \ + (fid)->f_ver + +/* scanf input parse format -- strip '[' first. + e.g. 
sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) \ + &((fid)->f_seq), \ + &((fid)->f_oid), \ + &((fid)->f_ver) + + +/********* Quotas **********/ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ + +#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ + +struct if_quotacheck { + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +/* for non-mapped uid/gid */ +#define NOBODY_UID 99 +#define NOBODY_GID 99 + +#define INVALID_ID (-1) + +enum { + RMT_LSETFACL = 1, + RMT_LGETFACL = 2, + RMT_RSETFACL = 3, + RMT_RGETFACL = 4 +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: .^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 +/* hdr + MDT index */ +#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:" + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ 
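+/*
+ * Illustrative sketch only (not part of this header): given a single
+ * changelog record obtained from the changelog reader, the helpers
+ * defined further down can be used to decode it.  The variable name
+ * "rec" and the output format are assumptions made for this example.
+ *
+ *	struct changelog_rec *rec = ...;
+ *
+ *	printf("%llu %s "DFID" %.*s\n",
+ *	       (unsigned long long)rec->cr_index,
+ *	       changelog_type2str(rec->cr_type),
+ *	       PFID(&rec->cr_tfid),
+ *	       rec->cr_namelen, changelog_rec_name(rec));
+ *
+ *	if (CHANGELOG_REC_EXTENDED(rec) && rec->cr_type == CL_RENAME)
+ *		printf("  source: %s\n", changelog_rec_sname(
+ *			(struct changelog_ext_rec *)rec));
+ */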
+/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags */ +#define CLF_VERSION 0x1000 +#define CLF_EXT_VERSION 0x2000 +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
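+ *
+ * Worked example (this is what hsm_get_cl_event() below does): with
+ * _h = CLF_HSM_EVENT_H = 9 and _l = CLF_HSM_EVENT_L = 7, the 16-bit flags
+ * word is shifted left by 15 - 9 = 6 so that bit 9 becomes bit 15, masked
+ * with 0xFFFF to drop anything that was above bit 9, then shifted right by
+ * 15 - 9 + 7 = 13, leaving the 3-bit hsm_event value in bits 0..2.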
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + \ + sizeof(struct changelog_ext_rec)) + +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save + * space, only rename uses changelog_ext_rec, while others use changelog_rec to + * store records. + */ +struct changelog_ext_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags & CLF_FLAGMASK) | + CLF_EXT_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< target parent fid */ + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +#define CHANGELOG_REC_EXTENDED(rec) \ + (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION) + +static inline int changelog_rec_size(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec): + sizeof(*rec); +} + +static inline char *changelog_rec_name(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? 
+ ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name; +} + +static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec) +{ + return rec->cr_namelen - strlen(rec->cr_name) - 1; +} + +static inline char *changelog_rec_sname(struct changelog_ext_rec *rec) +{ + return rec->cr_name + strlen(rec->cr_name) + 1; +} + +struct ioc_changelog { + __u64 icc_recno; + __u32 icc_mdtindex; + __u32 icc_id; + __u32 icc_flags; +}; + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling + version. Dirty caches are left unchanged. */ + +#ifndef offsetof +# define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + +static inline char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * it is returned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, connot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
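+ *
+ * Minimal validation sketch (the names "hur" and "rq_len" are assumptions
+ * for this example, not part of the API): a handler that received a
+ * request blob of rq_len bytes could check the declared size before
+ * walking hur_user_item[]:
+ *
+ *	ssize_t want = hur_len(hur);
+ *
+ *	if (want < 0 || want > rq_len)
+ *		return -EPROTO;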
+ */ +static inline ssize_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if (size != (ssize_t)size) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid hai_fid; /* Lustre FID to operated on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/* + * helper function which print in hexa the first bytes of + * hai opaque field + * \param hai [IN] record to print + * \param buffer [OUT] output buffer + * \param len [IN] max buffer len + * \retval buffer + */ +static inline char *hai_dump_data_field(struct hsm_action_item *hai, + char *buffer, int len) +{ + int i, sz, data_len; + char *ptr; + + ptr = buffer; + sz = len; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0 ; (i < data_len) && (sz > 0) ; i++) { + int cnt; + + cnt = snprintf(ptr, sz, "%.2X", + (unsigned char)hai->hai_data[i]); + ptr += cnt; + sz -= cnt; + } + *ptr = '\0'; + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname) + + 1)); +} +/* Return pointer to next hai */ +static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list */ +static inline int hal_size(struct hsm_action_list *hal) +{ + int i, sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); + hai = hai_zero(hal); + for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) + sz += cfs_size_round(hai->hai_len); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h b/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h new file mode 100644 index 000000000..aa4cfa7b7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#include + +#define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +#define LUSTRE_POSIX_ACL_MAX_SIZE \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h b/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h new file mode 100644 index 000000000..fe19534eb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h @@ -0,0 +1,305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_capa.h + * + * Author: Lai Siyao + */ + +#ifndef __LINUX_CAPA_H_ +#define __LINUX_CAPA_H_ + +/** \defgroup capa capa + * + * @{ + */ + +/* + * capability + */ +#include +#include "lustre/lustre_idl.h" + +#define CAPA_TIMEOUT 1800 /* sec, == 30 min */ +#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 days */ + +struct capa_hmac_alg { + const char *ha_name; + int ha_len; + int ha_keylen; +}; + +#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \ +[CAPA_HMAC_ALG_ ## type] = { \ + .ha_name = name, \ + .ha_len = len, \ + .ha_keylen = keylen, \ +} + +struct client_capa { + struct inode *inode; + struct list_head lli_list; /* link to lli_oss_capas */ +}; + +struct target_capa { + struct hlist_node c_hash; /* link to capa hash */ +}; + +struct obd_capa { + struct list_head c_list; /* link to capa_list */ + + struct lustre_capa c_capa; /* capa */ + atomic_t c_refc; /* ref count */ + unsigned long c_expiry; /* jiffies */ + spinlock_t c_lock; /* protect capa content */ + int c_site; + + union { + struct client_capa cli; + struct target_capa tgt; + } u; +}; + +enum { + CAPA_SITE_CLIENT = 0, + CAPA_SITE_SERVER, + CAPA_SITE_MAX +}; + +static inline struct lu_fid *capa_fid(struct lustre_capa *capa) +{ + return &capa->lc_fid; +} + +static inline __u64 capa_opc(struct lustre_capa *capa) +{ + return capa->lc_opc; +} + +static inline __u64 capa_uid(struct lustre_capa *capa) +{ + return capa->lc_uid; +} + +static inline __u64 capa_gid(struct lustre_capa *capa) +{ + return capa->lc_gid; +} + +static inline __u32 capa_flags(struct lustre_capa *capa) +{ + return capa->lc_flags & 0xffffff; +} + +static inline __u32 capa_alg(struct lustre_capa *capa) +{ + return (capa->lc_flags >> 24); +} + +static inline __u32 capa_keyid(struct lustre_capa *capa) +{ + return capa->lc_keyid; +} + +static inline __u64 capa_key_seq(struct lustre_capa_key *key) +{ + return key->lk_seq; +} + +static inline __u32 capa_key_keyid(struct lustre_capa_key *key) +{ + return key->lk_keyid; +} + +static inline __u32 capa_timeout(struct lustre_capa *capa) +{ + return capa->lc_timeout; +} + +static inline __u32 capa_expiry(struct lustre_capa *capa) +{ + return capa->lc_expiry; +} + +void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *, + const char *fmt, ...); +#define DEBUG_CAPA(level, capa, fmt, args...) \ +do { \ + if (((level) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (level)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + _debug_capa((capa), &msgdata, fmt, ##args); \ + } \ +} while (0) + +#define DEBUG_CAPA_KEY(level, k, fmt, args...) 
\ +do { \ +CDEBUG(level, fmt " capability key@%p seq %llu keyid %u\n", \ + ##args, k, capa_key_seq(k), capa_key_keyid(k)); \ +} while (0) + +typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *); + +/* obdclass/capa.c */ +extern struct list_head capa_list[]; +extern spinlock_t capa_lock; +extern int capa_count[]; +extern struct kmem_cache *capa_cachep; + +struct hlist_head *init_capa_hash(void); +void cleanup_capa_hash(struct hlist_head *hash); + +struct obd_capa *capa_add(struct hlist_head *hash, + struct lustre_capa *capa); +struct obd_capa *capa_lookup(struct hlist_head *hash, + struct lustre_capa *capa, int alive); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key); +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +void capa_cpy(void *dst, struct obd_capa *ocapa); +static inline struct obd_capa *alloc_capa(int site) +{ + struct obd_capa *ocapa; + + if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER)) + return ERR_PTR(-EINVAL); + + OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep); + if (unlikely(!ocapa)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ocapa->c_list); + atomic_set(&ocapa->c_refc, 1); + spin_lock_init(&ocapa->c_lock); + ocapa->c_site = site; + if (ocapa->c_site == CAPA_SITE_CLIENT) + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + else + INIT_HLIST_NODE(&ocapa->u.tgt.c_hash); + + return ocapa; +} + +static inline struct obd_capa *capa_get(struct obd_capa *ocapa) +{ + if (!ocapa) + return NULL; + + atomic_inc(&ocapa->c_refc); + return ocapa; +} + +static inline void capa_put(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + if (atomic_read(&ocapa->c_refc) == 0) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for"); + LBUG(); + } + + if (atomic_dec_and_test(&ocapa->c_refc)) { + LASSERT(list_empty(&ocapa->c_list)); + if (ocapa->c_site == CAPA_SITE_CLIENT) { + LASSERT(list_empty(&ocapa->u.cli.lli_list)); + } else { + struct hlist_node *hnode; + + hnode = &ocapa->u.tgt.c_hash; + LASSERT(!hnode->next && !hnode->pprev); + } + OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + } +} + +static inline int open_flags_to_accmode(int flags) +{ + int mode = flags; + + if ((mode + 1) & O_ACCMODE) + mode++; + if (mode & O_TRUNC) + mode |= 2; + + return mode; +} + +static inline __u64 capa_open_opc(int mode) +{ + return mode & FMODE_WRITE ? 
CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ; +} + +static inline void set_capa_expiry(struct obd_capa *ocapa) +{ + unsigned long expiry = cfs_time_sub((unsigned long)ocapa->c_capa.lc_expiry, + get_seconds()); + ocapa->c_expiry = cfs_time_add(cfs_time_current(), + cfs_time_seconds(expiry)); +} + +static inline int capa_is_expired_sec(struct lustre_capa *capa) +{ + return (capa->lc_expiry - get_seconds() <= 0); +} + +static inline int capa_is_expired(struct obd_capa *ocapa) +{ + return time_before_eq(ocapa->c_expiry, cfs_time_current()); +} + +static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc) +{ + return (capa_opc(capa) & opc) == opc; +} + +struct filter_capa_key { + struct list_head k_list; + struct lustre_capa_key k_key; +}; + +enum { + LC_ID_NONE = 0, + LC_ID_PLAIN = 1, + LC_ID_CONVERT = 2 +}; + +#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT) + +/** @} capa */ + +#endif /* __LINUX_CAPA_H_ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h b/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h new file mode 100644 index 000000000..7b385b872 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h @@ -0,0 +1,293 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_CFG_H +#define _LUSTRE_CFG_H + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)])) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
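+ *
+ * For example, a hypothetical record whose command field is
+ * (LCFG_PARAM | LCFG_REQUIRED) must be understood by the client: if it
+ * cannot parse that parameter it fails the mount rather than silently
+ * skipping the record.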
*/ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup */ + LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set + *a proc parameters */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) \ + ? 0 \ + : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, + void *buf, + __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + if (bufs == NULL) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, + char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? 
strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +{ + int i; + int offset; + int bufcount; + LASSERT (lcfg != NULL); + LASSERT (index >= 0); + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += cfs_size_round(lcfg->lcfg_buflens[i]); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + int i; + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) +{ + char *s; + + if (lcfg->lcfg_buflens[index] == 0) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (s == NULL) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. + */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + int last = min((int)lcfg->lcfg_buflens[index], + cfs_size_round(lcfg->lcfg_buflens[index]) - 1); + char lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} + +static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + int i; + int len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += cfs_size_round(buflens[i]); + + return cfs_size_round(len); +} + + +#include "obd_support.h" + +static inline struct lustre_cfg *lustre_cfg_new(int cmd, + struct lustre_cfg_bufs *bufs) +{ + struct lustre_cfg *lcfg; + char *ptr; + int i; + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + return ERR_PTR(-ENOMEM); + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr); + } + return lcfg; +} + +static inline void lustre_cfg_free(struct lustre_cfg *lcfg) +{ + int len; + + len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); + + OBD_FREE(lcfg, len); + return; +} + +static inline int lustre_cfg_sanity_check(void *buf, int len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +#include "lustre/lustre_user.h" + +/** @} cfg */ + +#endif /* _LUSTRE_CFG_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h b/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h new file mode 100644 index 
000000000..6c92d0bc9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_DEBUG_H +#define _LUSTRE_DEBUG_H + +/** \defgroup debug debug + * + * @{ + */ + +#include "lustre_net.h" +#include "obd.h" + +/* lib/debug.c */ +void dump_lniobuf(struct niobuf_local *lnb); +int dump_req(struct ptlrpc_request *req); +int block_debug_setup(void *addr, int len, __u64 off, __u64 id); +int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); + +/** @} debug */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h b/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h new file mode 100644 index 000000000..9b2833131 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h @@ -0,0 +1,547 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. 
+ * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/types.h" +#include + +/****************** on-disk files *********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define HSM_ACTIONS "hsm_actions" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. */ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 + +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK) + +#define LDD_F_MASK 0xFFFF + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +static inline char *mt_str(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "ext3", + "ldiskfs", + "smfs", + "reiserfs", + "ldiskfs2", + "zfs", + }; + return mount_type_string[mt]; +} + +static inline char *mt_type(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "osd-ldiskfs", + "osd-ldiskfs", + "osd-smfs", + "osd-reiserfs", + "osd-ldiskfs", + "osd-zfs", + }; + return mount_type_string[mt]; +} + +#define LDD_INCOMPAT_SUPP 0 +#define LDD_ROCOMPAT_SUPP 0 + +#define LDD_MAGIC 0x1dd00001 + +/* On-disk configuration file. In host-endian order. 
*/ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ + __u32 ldd_feature_incompat;/* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + svname */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + MTI_NAME_MAXLEN */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + +/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */ +/*1024*/__u8 ldd_padding[4096 - 1024]; +/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ +/*8192*/char ldd_params[4096]; /* key=value pairs */ +}; + + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/* Make the mdt/ost server obd name based on the filesystem name */ +static inline int server_make_name(__u32 flags, __u16 index, char *fs, + char *name) +{ + if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { + if (!(flags & LDD_F_SV_ALL)) + sprintf(name, "%.8s%c%s%04x", fs, + (flags & LDD_F_VIRGIN) ? ':' : + ((flags & LDD_F_WRITECONF) ? '=' : '-'), + (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST", + index); + } else if (flags & LDD_F_SV_TYPE_MGS) { + sprintf(name, "MGS"); + } else { + CERROR("unknown server type %#x\n", flags); + return 1; + } + return 0; +} + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + everything as string options */ + +#define LMD_MAGIC 0xbdacbd03 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + __u32 lmd_magic; + __u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + _device_ mount options) */ + char *lmd_params; /* lustre params */ + __u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ +#define LMD_FLG_HSM 
0x4000 /* Start coordinator */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + + +/****************** last_rcvd file *********************/ + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 +#define lr_epoch(a) ((a) >> LR_EPOCH_BITS) +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16 + * bits are now used to store a generation. Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 + +/* Data stored per server at the head of the last_rcvd file. In le32 order. 
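A small arithmetic sketch (illustrative only, the helper names are placeholders) of what the constants above imply: per-server data sits at the head of last_rcvd, client slots are fixed-size records starting at LR_CLIENT_START, and the version-recovery epoch occupies the top LR_EPOCH_BITS of a transaction number, which is what lr_epoch() extracts.

static inline loff_t example_lcd_offset(unsigned int client_idx)
{
	/* byte offset of client slot "client_idx" within last_rcvd */
	return (loff_t)LR_CLIENT_START + (loff_t)client_idx * LR_CLIENT_SIZE;
}

static inline __u64 example_versioned_transno(__u32 epoch, __u64 seq)
{
	/* assuming seq fits below LR_EPOCH_BITS, lr_epoch() recovers "epoch" */
	return ((__u64)epoch << LR_EPOCH_BITS) | seq;
}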
+ This should be common to filter_internal.h, lustre_mds.h */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) 
*/ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /** orphans handling for delayed export rely on that */ + __u32 lcd_first_epoch; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* bug20354: the lcd_uuid for export of clients may be wrong */ +static inline void check_lcd(char *obd_name, int index, + struct lsd_client_data *lcd) +{ + int length = sizeof(lcd->lcd_uuid); + if (strnlen((char*)lcd->lcd_uuid, length) == length) { + lcd->lcd_uuid[length - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; 
i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch); +} + +static inline __u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline __u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** superblock additional info *********************/ + +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + struct vfsmount *lsi_srv_mnt; /* the one server mount */ + atomic_t lsi_mounts; /* references to the srv_mnt */ + char lsi_svname[MTI_NAME_MAXLEN]; + char lsi_osd_obdname[64]; + char lsi_osd_uuid[64]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#define LSI_BDI_INITIALIZED 0x00400000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags) +#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev) + + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct vfsmount *lmi_mnt; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +int server_name2index(const char *svname, __u32 *idx, const char **endptr); +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h new file mode 100644 index 000000000..bac9902b5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h @@ -0,0 +1,1480 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include "lustre_lib.h" +#include "lustre_net.h" +#include "lustre_import.h" +#include "lustre_handles.h" +#include "interval_tree.h" /* for interval_node{}, ldlm_extent */ +#include "lu_ref.h" + +#include "lustre_dlm_flags.h" + +struct obd_ops; +struct obd_device; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000)) +#define LDLM_CTIME_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +typedef enum { + ELDLM_OK = 0, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401 +} ldlm_error_t; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +typedef enum { + LDLM_NAMESPACE_SERVER = 1 << 0, + LDLM_NAMESPACE_CLIENT = 1 << 1 +} ldlm_side_t; + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. 
When a client performs a path lookup, the MDS grants + * an inodebits lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
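The table above is exactly what lck_compat_array[] and lockmode_compat(), defined just below, encode. Reading the PR row as an illustrative check (the helper name is a placeholder):

static inline void example_check_pr_row(void)
{
	/* a granted PR lock tolerates new NL, CR and PR requests ... */
	LASSERT(lockmode_compat(LCK_PR, LCK_CR));
	LASSERT(lockmode_compat(LCK_PR, LCK_PR));
	/* ... but conflicts with CW, PW and EX */
	LASSERT(!lockmode_compat(LCK_PR, LCK_CW));
	LASSERT(!lockmode_compat(LCK_PR, LCK_PW));
	LASSERT(!lockmode_compat(LCK_PR, LCK_EX));
}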
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern ldlm_mode_t lck_compat_array[]; + +static inline void lockmode_verify(ldlm_mode_t mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode) +{ + return (lck_compat_array[exist_mode] & new_mode); +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool proc directory. */ + struct proc_dir_entry *pl_proc_dir; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time_t pl_recalc_time; + /** Recalculation period for pool. */ + time_t pl_recalc_period; + /** Recalculation and shrink operations. 
*/ + const struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; +}; + +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, ldlm_mode_t mode, __u64 flags, + void *data); + +typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +typedef enum { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +} ldlm_appetite_t; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +typedef enum { + /** invalid type */ + LDLM_NS_TYPE_UNKNOWN = 0, + /** mdc namespace */ + LDLM_NS_TYPE_MDC, + /** mds namespace */ + LDLM_NS_TYPE_MDT, + /** osc namespace */ + LDLM_NS_TYPE_OSC, + /** ost namespace */ + LDLM_NS_TYPE_OST, + /** mgc namespace */ + LDLM_NS_TYPE_MGC, + /** mgs namespace */ + LDLM_NS_TYPE_MGT, +} ldlm_ns_type_t; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. 
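As an illustrative sketch (the function name is a placeholder), this is roughly how a client-side namespace is obtained with ldlm_namespace_new(), declared near the end of this header, using the side, appetite and type enums above; the greedy appetite means cached locks are released aggressively.

static struct ldlm_namespace *example_client_namespace(struct obd_device *obd,
							char *name)
{
	return ldlm_namespace_new(obd, name,
				  LDLM_NAMESPACE_CLIENT,
				  LDLM_NAMESPACE_GREEDY,
				  LDLM_NS_TYPE_OSC);
}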
+ */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + ldlm_side_t ns_client; + + /** Resource hash table for namespace. */ + struct cfs_hash *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace proc dir entry */ + struct proc_dir_entry *ns_proc_dir_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + /** Maximum allowed age (last used time) for locks in the LRU */ + unsigned int ns_max_age; + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + unsigned int ns_ctime_age_limit; + + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + unsigned long ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + ldlm_appetite_t ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + unsigned ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. 
+ * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** Callback to cancel locks before replaying it during recovery. */ + ldlm_cancel_for_recovery ns_cancel_for_recovery; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_for_recovery arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel_for_recovery = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ +}; + +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. 
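A minimal client blocking callback matching the ldlm_blocking_callback typedef above, as an illustrative sketch (the function name is a placeholder; ldlm_lock2handle(), ldlm_cli_cancel() and LCF_ASYNC are declared later in this header): on LDLM_CB_BLOCKING the cached lock is handed back, on LDLM_CB_CANCELING any state cached under the lock would be dropped.

static int example_blocking_ast(struct ldlm_lock *lock,
				struct ldlm_lock_desc *desc,
				void *data, int flag)
{
	struct lustre_handle lockh;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		/* a conflicting request arrived: give our cached lock up */
		ldlm_lock2handle(lock, &lockh);
		return ldlm_cli_cancel(&lockh, LCF_ASYNC);
	case LDLM_CB_CANCELING:
		/* the lock is going away: invalidate cached data here */
		break;
	}
	return 0;
}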
+ */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + ldlm_mode_t lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +typedef enum { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST + * in the same RPC */ +} ldlm_cancel_flags_t; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + /* Protected by the hash lock */ + __u32 blocking_refs; + __u32 pid; +}; + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_policy_data_t; + +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. + */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted, waiting or converting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Tree node for ldlm_extent. + */ + struct ldlm_interval *l_tree_node; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + ldlm_mode_t l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + ldlm_mode_t l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. 
+ * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + ldlm_policy_data_t l_policy_data; + + /** + * Lock state flags. Protected by lr_lock. + * \see lustre_dlm_flags.h where the bits are defined. + */ + __u64 l_flags; + + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Seconds. It will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send blocking AST. + */ + unsigned long l_last_activity; + + /** + * Time last used by e.g. being matched by lock match. + * Jiffies. Should be converted to time if needed. + */ + unsigned long l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_thread.elt_expired_locks for further processing. + * Protected by elt_lock. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + unsigned long l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. 
\see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /* XXX No longer needed? Remove ASAP */ + ldlm_mode_t lr_most_restr; + + /** Type of locks this resource can hold. Only one type per resource. */ + ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + struct mutex lr_lvb_mutex; + int lr_lvb_len; + /** protected by lr_lock */ + void *lr_lvb_data; + + /** When the resource was considered as contended. */ + unsigned long lr_contention_time; + /** List of references to this resource. For debugging. 
*/ + struct lu_ref lr_reference; + + struct inode *lr_lvb_inode; +}; + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_rs_hash->hs_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL) + return ns->ns_lvbo->lvbo_init(res); + + return 0; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + __u32 ei_type; /** Type of the lock being enqueued. */ + __u32 ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ +}; + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern char *ldlm_it2str(int it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while (0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __printf(3, 4); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. + * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(struct lustre_handle *, void *); + +/** + * Obtain a lock reference by its handle. 
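The usual pattern around the handle helpers defined just below, as an illustrative sketch (the function name is a placeholder): the lookup takes its own temporary reference, the handle may no longer resolve to a lock, and the reference must be dropped with LDLM_LOCK_PUT() when done.

static int example_inspect_lock(const struct lustre_handle *lockh)
{
	struct ldlm_lock *lock;

	lock = ldlm_handle2lock(lockh);
	if (lock == NULL)
		return -ENOENT;		/* the lock has already gone away */

	LDLM_DEBUG(lock, "example: looking at this lock");
	/* ... read whatever is needed from the lock here ... */

	LDLM_LOCK_PUT(lock);		/* drop the reference taken above */
	return 0;
}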
+ */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *r, int increase) +{ + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r, + increase); + } + return 0; +} + +int ldlm_error2errno(ldlm_error_t error); +ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). + */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, ldlm_type_t type, + ldlm_policy_data_t *, ldlm_mode_t mode, + struct lustre_handle *, int unref); +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, struct lustre_handle *); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, 
ldlm_appetite_t apt, + ldlm_ns_type_t ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); +#if defined (CONFIG_PROC_FS) +int ldlm_proc_setup(void); +void ldlm_proc_cleanup(void); +#else +static inline int ldlm_proc_setup(void) { return 0; } +static inline void ldlm_proc_cleanup(void) {} +#endif + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + ldlm_type_t type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(ldlm_side_t client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __func__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __func__, current); \ +} while (0) + +/* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
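As a rough illustration of the client API grouped below, a lock obtained earlier through ldlm_cli_enqueue() is normally released by its handle; this sketch is hypothetical and passes 0 for the cancel flags on the assumption that 0 means "no special flags" for ldlm_cancel_flags_t:

/* Illustrative: cancel a previously enqueued lock by its handle. */
static void example_drop_lock(struct lustre_handle *lockh)
{
        int rc;

        rc = ldlm_cli_cancel(lockh, 0); /* 0: no special cancel flags */
        if (rc != 0)
                CERROR("ldlm_cli_cancel failed: %d\n", rc);
}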
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, + void *data, __u32 data_len); +int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_cancel_flags_t flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, ldlm_cancel_flags_t flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags); +/** @} ldlm_cli_api */ + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + + +/* ioctls for trying requests */ +#define IOC_LDLM_TYPE 'f' +#define IOC_LDLM_MIN_NR 40 + +#define IOC_LDLM_TEST _IOWR('f', 40, long) +#define IOC_LDLM_DUMP _IOWR('f', 41, long) +#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long) +#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) +#define IOC_LDLM_MAX_NR 43 + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. 
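To show why the LRT_* nesting modes exist, here is a hedged sketch of taking two resource spinlocks at once in a fixed order; the helper name and the ordering rule are illustrative only:

/* Illustrative: lock two resources without triggering lockdep warnings.
 * The first lr_lock is taken normally, the second with the LRT_NEW
 * subclass so lockdep knows the double acquisition is intentional. */
static void example_lock_two_res(struct ldlm_resource *a,
                                 struct ldlm_resource *b)
{
        lock_res(a);
        lock_res_nested(b, LRT_NEW);

        /* ... work on both resources ... */

        unlock_res(b);
        unlock_res(a);
}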
*/ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + assert_spin_locked(&res->lr_lock); +} + +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +int ldlm_pools_recalc(ldlm_side_t client); +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +#endif +/** @} LDLM */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h new file mode 100644 index 000000000..16dcdbfae --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h @@ -0,0 +1,476 @@ +/* -*- buffer-read-only: t -*- vi: set ro: + * + * DO NOT EDIT THIS FILE (lustre_dlm_flags.h) + * + * It has been AutoGen-ed + * From the definitions lustre_dlm_flags.def + * and the template file lustre_dlm_flags.tpl + * + * lustre is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lustre is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ +/** + * \file lustre_dlm_flags.h + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * This file is derived from flag definitions in lustre_dlm_flags.def. + * The format is defined in the lustre_dlm_flags.tpl template file. + * + * \addtogroup LDLM Lustre Distributed Lock Manager + * @{ + * + * \name flags + * The flags and collections of flags (masks) for \see struct ldlm_lock. 
+ * @{ + */ +#ifndef LDLM_ALL_FLAGS_MASK + +/** l_flags bits marked as "all_flags" bits */ +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL + +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK 0x0000000080008000ULL + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK 0x000000000000000EULL + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK 0x0006004000000000ULL + +/** l_flags bits marked as "hide_lock" bits */ +#define LDLM_FL_HIDE_LOCK_MASK 0x0000206400000000ULL + +/** l_flags bits marked as "inherit" bits */ +#define LDLM_FL_INHERIT_MASK 0x0000000000800000ULL + +/** l_flags bits marked as "local_only" bits */ +#define LDLM_FL_LOCAL_ONLY_MASK 0x00FFFFFF00000000ULL + +/** l_flags bits marked as "on_wire" bits */ +#define LDLM_FL_ON_WIRE_MASK 0x00000000C08F932FULL + +/** extent, mode, or resource changed */ +#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 +#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0) +#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0) +#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) + +/** + * Server placed lock on granted list, or a recovering client wants the + * lock added to the granted list, no questions asked. */ +#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1 +#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1) +#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1) +#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) + +/** + * Server placed lock on conv list, or a recovering client wants the lock + * added to the conv list, no questions asked. */ +#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 +#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) +#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) +#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) + +/** + * Server placed lock on wait list, or a recovering client wants the lock + * added to the wait list, no questions asked. */ +#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3 +#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3) +#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) +#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) + +/** blocking or cancel packet was queued for sending. */ +#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 +#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) +#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5) +#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) + +/** + * Lock is being replayed. This could probably be implied by the fact that + * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */ +#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8 +#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8) +#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8) +#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) + +/** Don't grant lock, just do intent. 
*/ +#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9 +#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9) +#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9) +#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9) + +/** lock request has intent */ +#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12 +#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12) +#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12) +#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12) + +/** flock deadlock detected */ +#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL /* bit 15 */ +#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG((_l), 1ULL << 15) +#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG((_l), 1ULL << 15) +#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15) + +/** discard (no writeback) on cancel */ +#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16 +#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16) +#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16) +#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16) + +/** Blocked by group lock - wait indefinitely */ +#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17 +#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17) +#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17) +#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17) + +/** + * Server told not to wait if blocked. For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18 +#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18) +#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18) +#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18) + +/** return blocking lock */ +#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19 +#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19) +#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19) +#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19) + +/** + * Immediately cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This + * is for clients (like liblustre) that cannot be expected to reliably + * response to blocking AST. 
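Every flag in this file follows the same generated pattern: a bit constant plus ldlm_is_/ldlm_set_/ldlm_clear_ wrappers around LDLM_TEST_FLAG()/LDLM_SET_FLAG()/LDLM_CLEAR_FLAG(), which are defined near the end of the file. A minimal hypothetical sketch of how such accessors are typically used, assuming the caller already holds the resource lock:

/* Illustrative: mark that a blocking AST has been queued for this lock,
 * unless the lock was requested with cancel-on-block semantics. */
static void example_mark_ast_sent(struct ldlm_lock *lock)
{
        check_res_locked(lock->l_resource);     /* caller holds lr_lock */

        if (!ldlm_is_cancel_on_block(lock))
                ldlm_set_ast_sent(lock);
}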
*/ +#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23 +#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23) +#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23) +#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23) + +/** + * measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 +#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30) +#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30) +#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30) + +/** + * These are flags that are mapped into the flags and ASTs of blocking + * locks Add FL_DISCARD to blocking ASTs */ +#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31 +#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31) +#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31) +#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31) + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep emulation + * + race with upcoming bl_ast. */ +#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32 +#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32) +#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) +#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) + +/** + * Used while processing the unused list to know that we have already + * handled this lock and decided to skip it. */ +#define LDLM_FL_SKIPPED 0x0000000200000000ULL // bit 33 +#define ldlm_is_skipped(_l) LDLM_TEST_FLAG(( _l), 1ULL << 33) +#define ldlm_set_skipped(_l) LDLM_SET_FLAG(( _l), 1ULL << 33) +#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33) + +/** this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 +#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) +#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34) +#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34) + +/** not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35 +#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35) +#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35) +#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35) + +/** cancellation callback already run */ +#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36 +#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36) +#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36) +#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36) + +/** whatever it might mean */ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 
1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancellation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. + * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. + * + * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is + * dropped to let ldlm_callback_handler() return EINVAL to the server. 
It + * is used when ELC RPC is already prepared and is waiting for rpc_lock, + * too late to send a separate CANCEL RPC. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** whatever it might mean */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. */ +#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52 +#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52) +#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52) +#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) + +/** + * It's set once we call ldlm_add_waiting_lock_res_locked() to start the + * lock-timeout timer and it will never be reset. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53 +#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53) +#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53) +#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) + +/** Flag whether this is a server namespace lock. 
*/ +#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54 +#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54) +#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54) +#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) + +/** Flag whether this lock can be reused. Used by exclusive open. */ +#define LDLM_FL_EXCL 0x0080000000000000ULL /* bit 55 */ +#define ldlm_is_excl(_l) LDLM_TEST_FLAG((_l), 1ULL << 55) +#define ldlm_set_excl(_l) LDLM_SET_FLAG((_l), 1ULL << 55) +#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) (((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) (((_l)->l_flags &= ~(_b)) + +/** Mask of flags inherited from parent lock when doing intents. */ +#define LDLM_INHERIT_FLAGS LDLM_FL_INHERIT_MASK + +/** Mask of Flags sent in AST lock_flags to map into the receiving lock. */ +#define LDLM_AST_FLAGS LDLM_FL_AST_MASK + +/** @} subgroup */ +/** @} group */ +#ifdef WIRESHARK_COMPILE +static int hf_lustre_ldlm_fl_lock_changed = -1; +static int hf_lustre_ldlm_fl_block_granted = -1; +static int hf_lustre_ldlm_fl_block_conv = -1; +static int hf_lustre_ldlm_fl_block_wait = -1; +static int hf_lustre_ldlm_fl_ast_sent = -1; +static int hf_lustre_ldlm_fl_replay = -1; +static int hf_lustre_ldlm_fl_intent_only = -1; +static int hf_lustre_ldlm_fl_has_intent = -1; +static int hf_lustre_ldlm_fl_flock_deadlock = -1; +static int hf_lustre_ldlm_fl_discard_data = -1; +static int hf_lustre_ldlm_fl_no_timeout = -1; +static int hf_lustre_ldlm_fl_block_nowait = -1; +static int hf_lustre_ldlm_fl_test_lock = -1; +static int hf_lustre_ldlm_fl_cancel_on_block = -1; +static int hf_lustre_ldlm_fl_deny_on_contention = -1; +static int hf_lustre_ldlm_fl_ast_discard_data = -1; +static int hf_lustre_ldlm_fl_fail_loc = -1; +static int hf_lustre_ldlm_fl_skipped = -1; +static int hf_lustre_ldlm_fl_cbpending = -1; +static int hf_lustre_ldlm_fl_wait_noreproc = -1; +static int hf_lustre_ldlm_fl_cancel = -1; +static int hf_lustre_ldlm_fl_local_only = -1; +static int hf_lustre_ldlm_fl_failed = -1; +static int hf_lustre_ldlm_fl_canceling = -1; +static int hf_lustre_ldlm_fl_local = -1; +static int hf_lustre_ldlm_fl_lvb_ready = -1; +static int hf_lustre_ldlm_fl_kms_ignore = -1; +static int hf_lustre_ldlm_fl_cp_reqd = -1; +static int hf_lustre_ldlm_fl_cleaned = -1; +static int hf_lustre_ldlm_fl_atomic_cb = -1; +static int hf_lustre_ldlm_fl_bl_ast = -1; +static int hf_lustre_ldlm_fl_bl_done = -1; +static int hf_lustre_ldlm_fl_no_lru = -1; +static int hf_lustre_ldlm_fl_fail_notified = -1; +static int hf_lustre_ldlm_fl_destroyed = -1; +static int hf_lustre_ldlm_fl_server_lock = -1; +static int hf_lustre_ldlm_fl_res_locked = -1; +static int hf_lustre_ldlm_fl_waited = -1; +static int hf_lustre_ldlm_fl_ns_srv = -1; +static int hf_lustre_ldlm_fl_excl = -1; + +const value_string lustre_ldlm_flags_vals[] = { + {LDLM_FL_LOCK_CHANGED, "LDLM_FL_LOCK_CHANGED"}, + {LDLM_FL_BLOCK_GRANTED, "LDLM_FL_BLOCK_GRANTED"}, + {LDLM_FL_BLOCK_CONV, "LDLM_FL_BLOCK_CONV"}, + {LDLM_FL_BLOCK_WAIT, "LDLM_FL_BLOCK_WAIT"}, + {LDLM_FL_AST_SENT, "LDLM_FL_AST_SENT"}, + {LDLM_FL_REPLAY, "LDLM_FL_REPLAY"}, + {LDLM_FL_INTENT_ONLY, "LDLM_FL_INTENT_ONLY"}, + {LDLM_FL_HAS_INTENT, "LDLM_FL_HAS_INTENT"}, + {LDLM_FL_FLOCK_DEADLOCK, "LDLM_FL_FLOCK_DEADLOCK"}, + {LDLM_FL_DISCARD_DATA, "LDLM_FL_DISCARD_DATA"}, + 
{LDLM_FL_NO_TIMEOUT, "LDLM_FL_NO_TIMEOUT"}, + {LDLM_FL_BLOCK_NOWAIT, "LDLM_FL_BLOCK_NOWAIT"}, + {LDLM_FL_TEST_LOCK, "LDLM_FL_TEST_LOCK"}, + {LDLM_FL_CANCEL_ON_BLOCK, "LDLM_FL_CANCEL_ON_BLOCK"}, + {LDLM_FL_DENY_ON_CONTENTION, "LDLM_FL_DENY_ON_CONTENTION"}, + {LDLM_FL_AST_DISCARD_DATA, "LDLM_FL_AST_DISCARD_DATA"}, + {LDLM_FL_FAIL_LOC, "LDLM_FL_FAIL_LOC"}, + {LDLM_FL_SKIPPED, "LDLM_FL_SKIPPED"}, + {LDLM_FL_CBPENDING, "LDLM_FL_CBPENDING"}, + {LDLM_FL_WAIT_NOREPROC, "LDLM_FL_WAIT_NOREPROC"}, + {LDLM_FL_CANCEL, "LDLM_FL_CANCEL"}, + {LDLM_FL_LOCAL_ONLY, "LDLM_FL_LOCAL_ONLY"}, + {LDLM_FL_FAILED, "LDLM_FL_FAILED"}, + {LDLM_FL_CANCELING, "LDLM_FL_CANCELING"}, + {LDLM_FL_LOCAL, "LDLM_FL_LOCAL"}, + {LDLM_FL_LVB_READY, "LDLM_FL_LVB_READY"}, + {LDLM_FL_KMS_IGNORE, "LDLM_FL_KMS_IGNORE"}, + {LDLM_FL_CP_REQD, "LDLM_FL_CP_REQD"}, + {LDLM_FL_CLEANED, "LDLM_FL_CLEANED"}, + {LDLM_FL_ATOMIC_CB, "LDLM_FL_ATOMIC_CB"}, + {LDLM_FL_BL_AST, "LDLM_FL_BL_AST"}, + {LDLM_FL_BL_DONE, "LDLM_FL_BL_DONE"}, + {LDLM_FL_NO_LRU, "LDLM_FL_NO_LRU"}, + {LDLM_FL_FAIL_NOTIFIED, "LDLM_FL_FAIL_NOTIFIED"}, + {LDLM_FL_DESTROYED, "LDLM_FL_DESTROYED"}, + {LDLM_FL_SERVER_LOCK, "LDLM_FL_SERVER_LOCK"}, + {LDLM_FL_RES_LOCKED, "LDLM_FL_RES_LOCKED"}, + {LDLM_FL_WAITED, "LDLM_FL_WAITED"}, + {LDLM_FL_NS_SRV, "LDLM_FL_NS_SRV"}, + {LDLM_FL_EXCL, "LDLM_FL_EXCL"}, + { 0, NULL } +}; +#endif /* WIRESHARK_COMPILE */ +#endif /* LDLM_ALL_FLAGS_MASK */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h b/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h new file mode 100644 index 000000000..0f8f76c43 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
+ */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL + +#include + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + + +extern ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size); +extern int +lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size, + posix_acl_xattr_header **out); +extern void +lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size); +extern void +lustre_ext_acl_xattr_free(ext_acl_xattr_header *header); +extern int +lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out); +extern ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header); + +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_export.h b/kernel/drivers/staging/lustre/lustre/include/lustre_export.h new file mode 100644 index 000000000..9c06a49f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_export.h @@ -0,0 +1,406 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
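The CFS_ACL_XATTR_SIZE()/CFS_ACL_XATTR_COUNT() macros above are plain token-pasting helpers; as a rough worked example (the wrapper function is hypothetical), sizing an extended-ACL xattr buffer for n entries looks like:

/* Illustrative sizing only: for the ext_acl_xattr layout defined above,
 * size = sizeof(ext_acl_xattr_header) + n * sizeof(ext_acl_xattr_entry). */
static size_t example_ext_acl_size(__u32 n)
{
        return CFS_ACL_XATTR_SIZE(n, ext_acl_xattr);
}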
+ */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include "lprocfs_status.h" +#include "lustre/lustre_idl.h" +#include "lustre_dlm.h" + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects led_lcd below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ + /** Bitmask of all ibit locks this MDT understands */ + __u64 med_ibits_known; + struct mutex med_idmap_mutex; + struct lustre_idmap_table *med_idmap; +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ + long fed_dirty; /* in bytes */ + long fed_grant; /* in bytes */ + struct list_head fed_mod_list; /* files being modified */ + int fed_mod_count;/* items in fed_writing list */ + long fed_pending; /* bytes just being written */ + __u32 fed_group; + __u8 fed_pagesize; /* log2 of client page size */ +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. + * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while (0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while (0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. 
+ * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + struct lprocfs_stats *exp_md_stats; + /** Active connection */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + struct cfs_hash *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. + */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + unsigned long exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* client timed out and tried to reconnect, + * but couldn't because of active rpcs */ + exp_abort_active_req:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. 
*/ + exp_need_mne_swab:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + unsigned long exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_expired(struct obd_export *exp, long age) +{ + LASSERT(exp->exp_delayed); + return time_before(cfs_time_add(exp->exp_last_request_time, age), + get_seconds()); +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_rmtclient(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT); +} + +static inline int client_is_remote(struct obd_export *exp) +{ + struct obd_import *imp = class_exp2cliimp(exp); + + return !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_RMT_CLIENT); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_som(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & 
OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h b/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h new file mode 100644 index 000000000..0a0929fd9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h @@ -0,0 +1,767 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. 
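As a concrete, made-up illustration of the SEQ/OID/VER split described above, a client that owns sequence 0x200000401 (just past the reserved range) and has handed out five objects would compose its next FID roughly like this:

/* Illustrative only: compose a FID from its three components. */
static void example_compose_fid(struct lu_fid *fid)
{
        fid->f_seq = 0x200000401ULL;    /* SEQ: client-unique 64-bit value */
        fid->f_oid = 6;                 /* OID: next per-SEQ object number */
        fid->f_ver = 0;                 /* VER: 0 for the primary namespace */
}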
+ * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. 
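A short worked example of the IDIF mapping spelled out above, with made-up values: for ost_index = 2 and objid = 0x500001234, the surrogate FID comes out as f_seq = 0x100020005 and f_oid = 0x00001234.

/* Illustrative IDIF construction, following the formulas in the comment
 * above: SEQ = 2^32 | (ost_index << 16) | (high 16 bits of objid),
 * OID = low 32 bits of objid. */
static void example_idif_fid(struct lu_fid *fid, __u32 ost_index, __u64 objid)
{
        fid->f_seq = (1ULL << 32) | ((__u64)ost_index << 16) |
                     ((objid >> 32) & 0xffff);
        fid->f_oid = (__u32)(objid & 0xffffffff);
        fid->f_ver = 0;
}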
+ * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" + +struct lu_env; +struct lu_site; +struct lu_context; +struct obd_device; +struct obd_export; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. 
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + /* These two definitions are obsolete + * OFD_GROUP0_LAST_OID = 20UL, + * OFD_GROUP4K_LAST_OID = 20UL+4096, + */ + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == 1)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + (fid_oid(fid) == ACCT_USER_OID || + fid_oid(fid) == ACCT_GROUP_OID); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. 
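For the reserved local objects listed in enum local_oid above, FIDs are built with lu_local_obj_fid(); a purely illustrative example for the FLD index object:

/* Illustrative: the FID of the FLD index object is
 * [FID_SEQ_LOCAL_FILE : FLD_INDEX_OID : 0]. */
static void example_fld_index_fid(struct lu_fid *fid)
{
        lu_local_obj_fid(fid, FLD_INDEX_OID);
}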
*/ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_is_dot_lustre(fid); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, 0); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), "%#llx\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequences. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related proc */ + struct proc_dir_entry *lcs_proc_dir; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related proc */ + struct proc_dir_entry *lss_proc_dir; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lss_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. 
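The client-side sequence manager above is what hands out new FIDs; a hedged sketch of a caller using seq_client_alloc_fid() (declared just below), with error handling reduced to the bare minimum and "env"/"seq" assumed to be set up elsewhere:

/* Illustrative: ask the client sequence manager for the next FID. */
static int example_new_fid(const struct lu_env *env,
                           struct lu_client_seq *seq, struct lu_fid *fid)
{
        int rc;

        rc = seq_client_alloc_fid(env, seq, fid);
        if (rc < 0)
                return rc;      /* allocation failed */
        return 0;
}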
+ */ + struct seq_server_site *lss_site; +}; + +/* Server methods */ + +int seq_server_init(struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss, + const struct lu_env *env); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(struct lu_server_seq *seq, + struct lu_client_seq *cli, + const struct lu_env *env); + +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline struct ldlm_res_id * +fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) +{ + memset(res, 0, sizeof(*res)); + res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); + res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); + + return res; +} + +/* + * Return true if resource is for object identified by FID. + */ +static inline int fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) +{ + return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && + res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); +} + +/* + * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). + */ +static inline struct lu_fid * +fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) +{ + fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); + fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + LASSERT(fid_res_name_eq(fid, res)); + + return fid; +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. 
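+ *
+ * Packing note (descriptive only, restating what the code below does): the
+ * global quota FID occupies the same seq and ver/oid slots that
+ * fid_build_reg_res_name() fills, and the per-ID quota FID from \a qid is
+ * packed into the LUSTRE_RES_ID_QUOTA_SEQ_OFF and
+ * LUSTRE_RES_ID_QUOTA_VER_OID_OFF slots.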
+ */ +static inline struct ldlm_res_id * +fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); + + return res; +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, + union lquota_id *qid, + const struct ldlm_res_id *res) +{ + fid_extract_from_res_name(glb_fid, res); + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +static inline struct ldlm_res_id * +fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(fid, res); + res->name[LUSTRE_RES_ID_HSH_OFF] = hash; + + return res; +} + +/** + * Build DLM resource name from object id & seq, which will eventually be + * removed when we replace ost_id with FID in the data stack. + * + * Currently, a resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just the opposite of the metadata + * resid, where res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unify the resid identification, we will reverse the data + * resid to keep it the same as the metadata resid, i.e. + * + * For a resid from the old client, + * res[0] = objid, res[1] = 0, we still keep the original order + * for compatibility. + * + * For a new resid, + * res will be built from the normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof(*name)); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name(&oi->oi_fid, name); + } +} + +static inline void ostid_res_name_to_id(struct ost_id *oi, + struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) { + /* old resid */ + ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + } else { + /* new resid */ + fid_extract_from_res_name(&oi->oi_fid, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group.
+ */ +static inline int ostid_res_name_eq(struct ost_id *oi, + struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort; probably the + * correct way would be to turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + ostid_to_fid(fid, &oi, 0); + } else { + /* new resid */ + fid_extract_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. + */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ? ino : fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with the same id and different versions will belong to the + * same collision list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * Map the fid to a 32-bit value for ino on 32-bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?
ino : fid_oid(fid); +} + +static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h b/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h new file mode 100644 index 000000000..5ee4b1ed0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h @@ -0,0 +1,160 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include "lustre/lustre_idl.h" +#include "../../include/linux/libcfs/libcfs.h" + +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; + +extern const struct dt_index_features fld_index_features; +extern const char fld_index_name[]; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir proc entry. */ + struct proc_dir_entry *lsf_proc_dir; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[LUSTRE_MDT_MAXNAMELEN]; + +}; + +struct lu_client_fld { + /** + * Client side proc entry. */ + struct proc_dir_entry *lcf_proc_dir; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld proc entry name. */ + char lcf_name[LUSTRE_MDT_MAXNAMELEN]; + + int lcf_flags; +}; + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_proc_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h b/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h new file mode 100644 index 000000000..f3ae02b3e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h @@ -0,0 +1,64 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); + +/** @} ha */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h b/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h new file mode 100644 index 000000000..726bbd3ea --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include +#include +#include + +#include "../../include/linux/libcfs/libcfs.h" + + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. 
ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. -jxiong */ + struct rcu_head h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; +#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu) + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie); +void class_handle_free_cb(struct rcu_head *rcu); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_import.h b/kernel/drivers/staging/lustre/lustre/include/lustre_import.h new file mode 100644 index 000000000..dcc807676 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_import.h @@ -0,0 +1,385 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. 
+ * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include "lustre_handles.h" +#include "lustre/lustre_idl.h" + + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, +}; + +/** Returns test string representation of numeric import state \a state */ +static inline char *ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char *import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit jiffies) of last connection attempt on this connection + */ + __u64 oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time_t ish_time; +}; + +/** + * Definition of PortalRPC import structure. + * Imports are representing client-side view to remote target. + */ +struct obd_import { + /** Local handle (== id) for this import. 
*/ + struct portals_handle imp_handle; + /** Reference counter */ + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** List element for linking into chain for destruction */ + struct list_head imp_zombie_chain; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. + * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** + * List of requests that are retained for committed open replay. Once + * open is committed, open replay request will be moved from the + * imp_replay_list into the imp_committed_list. + * The imp_replay_cursor is for accelerating searching during replay. + * @{ + */ + struct list_head imp_committed_list; + struct list_head *imp_replay_cursor; + /** @} */ + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some seciruty-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + struct mutex imp_sec_mutex; + unsigned long imp_sec_expire; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Numbner of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** Last replay state */ + enum lustre_imp_state imp_replay_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. Incremented on every reconnect */ + int imp_generation; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last transno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + unsigned long imp_next_ping; + /** When we last successfully connected. time in 64bit jiffies */ + __u64 imp_last_success_conn; + + /** List of all possible connection for import. */ + struct list_head imp_conn_list; + /** + * Current connection. 
\a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /* flags */ + unsigned long imp_no_timeout:1, /* timeouts are disabled */ + imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* VBR: if gap was found then no lock replays + */ + imp_no_lock_replay:1, + /* recovery by versions was failed */ + imp_vbr_failed:1, + /* force an immediate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. */ + imp_no_pinger_recover:1, + /* need IR MNE swab */ + imp_need_mne_swab:1, + /* import must be reconnected instead of + * chose new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + __u32 imp_msghdr_flags; /* adjusted based on server capability */ + + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ +}; + +typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, + int event, void *event_arg, void *cb_data); + +/** + * Structure for import observer. + * It is possible to register "observer" on an import and every time + * something happens to an import (like connect/evict/disconnect) + * obderver will get its callback called with event type + */ +struct obd_import_observer { + struct list_head oio_chain; + obd_import_callback oio_cb; + void *oio_cb_data; +}; + +void class_observe_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_notify_import_observers(struct obd_import *imp, int event, + void *event_arg); + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) +{ + spin_lock(&at->at_lock); + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = get_seconds(); + spin_unlock(&at->at_lock); +} +static inline void at_init(struct adaptive_timeout *at, int val, int flags) +{ + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) +{ + return (at->at_current > at_min) ? 
at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h b/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h new file mode 100644 index 000000000..c491d52d8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + void *it_data; + unsigned int it_lock_set:1; +}; + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + union { + struct lustre_intent_data lustre; + } d; +}; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h b/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h new file mode 100644 index 000000000..bf135630c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -0,0 +1,666 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#include +#include +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lustre_ver.h" +#include "lustre_cfg.h" + +/* target.c */ +struct kstatfs; +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include "lustre_ha.h" +#include "lustre_net.h" + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set); + +#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* client.c */ + +int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +struct client_obd *client_conn2cli(struct lustre_handle *conn); + +struct md_open_data; +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + fmode_t och_flags; +}; +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +/* statfs_pack.c */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* + * For md echo client + */ +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +/* + * OBD IOCTLS + */ +#define OBD_IOCTL_VERSION 0x00010004 + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + u64 ioc_count; + u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char *ioc_pbuf1; + __u32 ioc_plen2; + char *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char 
*ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline int obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + int len = cfs_size_round(sizeof(struct obd_ioctl_data)); + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + len += cfs_size_round(data->ioc_inllen3); + len += cfs_size_round(data->ioc_inllen4); + return len; +} + + +static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_len larger than %d\n", + OBD_MAX_IOCTL_BUFFER); + return 1; + } + if (data->ioc_inllen1 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen1 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen2 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen2 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen3 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen3 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen4 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen4 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf3 && !data->ioc_inllen3) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf4 && !data->ioc_inllen4) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + return 0; +} + + +#include "obd_support.h" + +/* function defined in lustre/obdclass//-module.c */ +int obd_ioctl_getdata(char **buf, int *len, void *arg); +int obd_ioctl_popdata(void *arg, void *data, int len); + +static inline void obd_ioctl_freedata(char *buf, int len) +{ + OBD_FREE_LARGE(buf, len); + return; +} + +/* + * BSD ioctl description: + * #define IOC_V1 _IOR(g, n1, long) + * #define IOC_V2 _IOW(g, n2, long) + * + * ioctl(f, IOC_V1, arg); + * arg will be treated as a long value, + * + * ioctl(f, IOC_V2, arg) + * arg will be treated as a pointer, bsd will call + * copyin(buf, arg, sizeof(long)) + * + * To make BSD ioctl handles argument correctly and simplely, + * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data + * for us. Does this change affect Linux? 
(XXX Liang) + */ +#define OBD_IOC_DATA_TYPE long + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) +#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) +#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME + +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) +#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) + +#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE) +/* see also for ioctls 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */ +#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */ +#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */ +#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE) +/* see for ioctls 157-159 */ +/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */ +#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int) +/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */ +#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* see also for ioctls 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data) +#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) +#define 
OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO is deprecated */ +#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) + +#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* defines ioctl number 218-219 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) + +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) + +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) + +/* XXX _IOWR('f', 250, long) has been defined in + * libcfs/include/libcfs/libcfs_private.h for debug, don't use it + */ + +/* Until such time as we get_info the per-stripe maximum from the OST, + * we define this to be 2T - 4k, which is the ext3 maxbytes. */ +#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL + +/* Special values for remove LOV EA from disk */ +#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \ + offset == (typeof(offset))(-1)) + +/* #define POISON_BULK 0 */ + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. + * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. 
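+ *
+ * As an illustration only (the waitq and flag names below are hypothetical,
+ * not defined by this header), a caller of the first form could look like:
+ *
+ *   struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+ *                                             LWI_ON_SIGNAL_NOOP, NULL);
+ *   rc = l_wait_event(obj->obj_waitq, obj->obj_ready != 0, &lwi);
+ *
+ * With a NULL 'timeout_handler' this returns 0 once obj_ready becomes
+ * non-zero, or -ETIMEDOUT after the 30 second timeout expires.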
+ * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. + * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessary takes + * wait-queue spin-lock before returning, and, hence, following idiom is safe + * ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. + * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + long lwi_timeout; + long lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + +#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ + sigmask(SIGTERM) | sigmask(SIGQUIT) | \ + sigmask(SIGALRM)) + + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. 
+ */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_t __wait; \ + long __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry(&__wait, current); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). */ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + unsigned __wstate; \ + \ + __wstate = info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr) ? \ + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \ + \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + schedule(); \ + } else { \ + long interval = info->lwi_interval? \ + min_t(long, \ + info->lwi_interval,__timeout):\ + __timeout; \ + long remaining = schedule_timeout(interval);\ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. */ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (cfs_signal_pending()) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. 
*/ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +#define LIBLUSTRE_CLIENT (0) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h b/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h new file mode 100644 index 000000000..df557c22a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h @@ -0,0 +1,150 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LL_H +#define _LL_H + +/** \defgroup lite lite + * + * @{ + */ + +#include "linux/lustre_lite.h" + +#include "obd_class.h" +#include "lustre_net.h" +#include "lustre_mds.h" +#include "lustre_ha.h" + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS (22) +#define LL_MAX_BLKSIZE (1UL<lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW; + params->lrp_brw_flags = 0; + + params->lrp_policy.l_extent.start = pos; + params->lrp_policy.l_extent.end = pos + len - 1; + /* + * for now O_APPEND always takes local locks. 
+ */ + if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) { + params->lrp_policy.l_extent.start = 0; + params->lrp_policy.l_extent.end = OBD_OBJECT_EOF; + } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) { + /* + * liblustre: OST-side locking for all non-O_APPEND + * reads/writes. + */ + params->lrp_lock_mode = LCK_NL; + params->lrp_brw_flags = OBD_BRW_SRVLOCK; + } else { + /* + * nothing special for the kernel. In the future llite may use + * OST-side locks for small writes into highly contended + * files. + */ + } + params->lrp_ast_flags = (open_flags & O_NONBLOCK) ? + LDLM_FL_BLOCK_NOWAIT : 0; +} + +/* + * This is embedded into liblustre and llite super-blocks to keep track of + * connect flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports (LOVs) this + * mount is connected to. This field is updated by cl_ocd_update() + * under ->lco_lock. + */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +/* + * Chain of hash overflow pages. + */ +struct ll_dir_chain { + /* XXX something. Later */ +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 as index 0 because otherwise we'll save it at + * page index end (~0UL) and it causes truncate_inode_pages_range() + * to loop forever. + */ + return ~0UL - (hash + !hash); +} + +/** @} lite */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_log.h b/kernel/drivers/staging/lustre/lustre/include/lustre_log.h new file mode 100644 index 000000000..2187fb615 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_log.h @@ -0,0 +1,545 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. 
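A tiny userspace model of the hash_x_index() mapping in lustre_lite.h above may help; it assumes a 64-bit unsigned long and is purely illustrative.

#include <stdio.h>

/* Userspace model of hash_x_index(): the "+ !hash" term keeps page index
 * ~0UL (page index end) out of the mapping, which is what prevents
 * truncate_inode_pages_range() from looping forever on hash 0. */
static unsigned long demo_hash_x_index(unsigned long long hash)
{
        return ~0UL - (hash + !hash);
}

int main(void)
{
        printf("%lx %lx %lx\n",
               demo_hash_x_index(0),   /* ~0UL - 1, not ~0UL */
               demo_hash_x_index(1),   /* ~0UL - 1 */
               demo_hash_x_index(2));  /* ~0UL - 2 */
        return 0;
}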
+ * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include "obd_class.h" +#include "lustre/lustre_idl.h" +#include "dt_object.h" + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + struct llog_cookie phd_cookie; /* cookie of this log in its cat */ +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log; /* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. 
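As a sketch of the general llog API declared above, the usual open/init/process/close sequence for reading a plain log by name might look like the following. The callback and the "demo" names are hypothetical; LLOG_F_IS_PLAIN is the plain-log flag from lustre_idl.h, and error handling is abbreviated.

static int demo_print_rec(const struct lu_env *env, struct llog_handle *llh,
                          struct llog_rec_hdr *rec, void *data)
{
        CDEBUG(D_INFO, "record index %u, type %x\n",
               rec->lrh_index, rec->lrh_type);
        return 0;       /* 0 continues with the next record */
}

static int demo_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
                          char *name)
{
        struct llog_handle *llh;
        int rc;

        rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
        if (rc)
                return rc;

        rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
        if (rc == 0)
                rc = llog_process(env, llh, demo_print_rec, NULL, NULL);

        llog_close(env, llh);
        return rc;
}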
+ */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. + */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data, int startcat, int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh); + +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. + */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. 
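For the catalog API, a hedged sketch of walking every record of every plain log referenced by a catalog with llog_cat_process() declared above; cathandle is assumed to be an already opened and initialized catalog handle, and the callback and counter are hypothetical.

static int demo_cat_cb(const struct lu_env *env, struct llog_handle *llh,
                       struct llog_rec_hdr *rec, void *data)
{
        int *count = data;

        (*count)++;     /* llh is the plain log currently being scanned */
        return 0;
}

static int demo_count_cat_records(const struct lu_env *env,
                                  struct llog_handle *cathandle)
{
        int count = 0;
        int rc;

        /* startcat = 0, startidx = 0: process from the beginning */
        rc = llog_cat_process(env, cathandle, demo_cat_cb, &count, 0, 0);
        return rc < 0 ? rc : count;
}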
+ */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, int cookiecount, + void *buf, int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + void *buf, struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; + struct file *lgh_file; + struct dt_object *lgh_obj; + int lgh_last_idx; + int lgh_cur_idx; /* used during llog_process */ + __u64 lgh_cur_offset; /* used during llog_process */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; +}; + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. 
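The reference-counting discipline implied by llog_get_context()/llog_ctxt_put() above can be summarized in a short sketch; the context index and the check performed while the reference is held are hypothetical.

static int demo_use_ctxt(struct obd_device *obd, int ctxt_idx)
{
        struct llog_ctxt *ctxt;
        int rc;

        ctxt = llog_get_context(obd, ctxt_idx);
        if (ctxt == NULL)
                return -ENOENT;         /* no llog set up at this index */

        /* safe to dereference ctxt while we hold the reference */
        rc = ctxt->loc_idx == ctxt_idx ? 0 : -EINVAL;

        llog_ctxt_put(ctxt);            /* balances llog_get_context() */
        return rc;
}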
MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg, int group) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); + olg->olg_seq = group; +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return llog_group_ctxt_null(&obd->obd_olg, index); +} + +static inline int llog_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + if (lop->lop_destroy == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_destroy(env, handle); + return rc; +} + +static inline int llog_next_block(const struct lu_env *env, + struct 
llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_next_block == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + return rc; +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_prev_block == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + return rc; +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + return rc; + if (lop->lop_connect == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + return rc; +} + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx); + +/** @} log */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h b/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h new file mode 100644 index 000000000..b1b05c8a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h @@ -0,0 +1,191 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#include "lustre_intent.h" +#include "lustre_handles.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "obd_class.h" +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lustre_dlm.h" +#include "lustre_export.h" + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +struct mdc_rpc_lock { + struct mutex rpcl_mutex; + struct lookup_intent *rpcl_it; + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT)) + return; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); +} + +/* Update the maximum observed easize and cookiesize. 
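A brief sketch of the pairing expected by mdc_get_rpc_lock()/mdc_put_rpc_lock() defined above: modifying metadata RPCs are serialized one at a time per client, while GETATTR/LOOKUP/LAYOUT intents pass straight through. ptlrpc_queue_wait() is assumed to be the usual synchronous PtlRPC send helper and is not declared in this header.

static int demo_send_serialized(struct mdc_rpc_lock *rpc_lock,
                                struct lookup_intent *it,
                                struct ptlrpc_request *req)
{
        int rc;

        mdc_get_rpc_lock(rpc_lock, it); /* may sleep on rpcl_mutex */
        rc = ptlrpc_queue_wait(req);    /* assumed PtlRPC helper */
        mdc_put_rpc_lock(rpc_lock, it);

        return rc;
}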
The default easize + * and cookiesize is initialized to the minimum value but allowed to grow + * up to a single page in size if required to handle the common case. + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + + if (cli->cl_max_mds_easize < body->max_mdsize) { + cli->cl_max_mds_easize = body->max_mdsize; + cli->cl_default_mds_easize = + min_t(__u32, body->max_mdsize, PAGE_CACHE_SIZE); + } + if (cli->cl_max_mds_cookiesize < body->max_cookiesize) { + cli->cl_max_mds_cookiesize = body->max_cookiesize; + cli->cl_default_mds_cookiesize = + min_t(__u32, body->max_cookiesize, PAGE_CACHE_SIZE); + } + } +} + + +struct mdc_cache_waiter { + struct list_head mcw_entry; + wait_queue_head_t mcw_waitq; +}; + +/* mdc/mdc_locks.c */ +int it_disposition(struct lookup_intent *it, int flag); +void it_clear_disposition(struct lookup_intent *it, int flag); +void it_set_disposition(struct lookup_intent *it, int flag); +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE) + *flags &= ~O_LOV_DELAY_CREATE; +} + +/** @} mdc */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h b/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h new file mode 100644 index 000000000..f0cce41c5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
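A small userspace model of the easize update described above: the maximum grows to whatever the server reports, while the default is capped at one page (PAGE_CACHE_SIZE, assumed 4096 here). Names and numbers are illustrative only.

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096U

struct demo_client {
        unsigned int max_easize;
        unsigned int default_easize;
};

static void demo_update_easize(struct demo_client *cli, unsigned int reported)
{
        if (cli->max_easize < reported) {
                cli->max_easize = reported;
                cli->default_easize = reported < DEMO_PAGE_SIZE ?
                                      reported : DEMO_PAGE_SIZE;
        }
}

int main(void)
{
        struct demo_client cli = { .max_easize = 128, .default_easize = 128 };

        demo_update_easize(&cli, 1024);   /* default follows: 1024 */
        demo_update_easize(&cli, 65536);  /* default capped at 4096 */
        printf("max %u default %u\n", cli.max_easize, cli.default_easize);
        return 0;
}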
+ */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include "lustre_handles.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lustre_dlm.h" +#include "lustre_export.h" + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(__u64 flags) +{ + return !(flags & MDS_OPEN_DELAY_CREATE || + !(flags & FMODE_WRITE)); +} + +/* these are local flags, used only on the client, private */ +#define M_CHECK_STALE 0200000000 + +/** @} mds */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_net.h b/kernel/drivers/staging/lustre/lustre/include/lustre_net.h new file mode 100644 index 000000000..e2805bd1a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_net.h @@ -0,0 +1,2967 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ + +#include "../../include/linux/libcfs/libcfs.h" +// #include +#include "../../include/linux/lnet/lnet.h" +#include "lustre/lustre_idl.h" +#include "lustre_ha.h" +#include "lustre_sec.h" +#include "lustre_import.h" +#include "lprocfs_status.h" +#include "lu_object.h" +#include "lustre_req_layout.h" + +#include "obd_support.h" +#include "lustre_ver.h" + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * Max # of bulk operations in one request. + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. 
The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */ +#define PTLRPC_BULK_OPS_BITS 2 +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) + +#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +# endif +# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE" +# endif +# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +# endif +# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +# endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. 
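A quick userspace check of the bulk-size arithmetic above, assuming the usual LNET_MTU_BITS = 20 (1 MB LNet MTU) and 4 KB pages; the values are for illustration only.

#include <stdio.h>

#define DEMO_LNET_MTU_BITS      20
#define DEMO_PAGE_SHIFT         12
#define DEMO_BULK_OPS_BITS      2

int main(void)
{
        unsigned int ops   = 1U << DEMO_BULK_OPS_BITS;                    /* 4 */
        unsigned int size  = 1U << (DEMO_LNET_MTU_BITS + DEMO_BULK_OPS_BITS);
        unsigned int pages = size >> DEMO_PAGE_SHIFT;

        /* 4 bulk ops of 1 MB each: 4 MB per BRW RPC, 1024 pages of 4 KB */
        printf("ops=%u max_brw_size=%u max_brw_pages=%u\n", ops, size, pages);
        return 0;
}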
+ * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. 
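The four MDS examples above follow a simple per-partition rule; a self-contained userspace model reproducing their numbers is sketched below (simplified, the real logic lives in ptlrpc_server_nthreads_check()).

#include <stdio.h>

#define DEMO_NTHRS_BASE   64
#define DEMO_NTHRS_FACTOR 8
#define DEMO_NTHRS_MAX    1024

static unsigned int demo_nthrs_per_partition(unsigned int cores,
                                             unsigned int partitions)
{
        unsigned int per_cpt = cores / partitions;
        unsigned int nthrs   = DEMO_NTHRS_BASE + per_cpt * DEMO_NTHRS_FACTOR;
        unsigned int cap     = DEMO_NTHRS_MAX / partitions;

        if (nthrs > cap)
                nthrs = cap;                    /* soft upper limit */
        if (nthrs < DEMO_NTHRS_BASE)
                nthrs = DEMO_NTHRS_BASE;        /* keep the service healthy */
        return nthrs;
}

int main(void)
{
        /* Examples 1-4 above: 96, 128, 128 and 64 threads per partition */
        printf("%u %u %u %u\n",
               demo_nthrs_per_partition(16, 4),
               demo_nthrs_per_partition(32, 4),
               demo_nthrs_per_partition(96, 8),
               demo_nthrs_per_partition(1000, 32));
        return 0;
}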
+ * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ + +#define OST_MAXREQSIZE (5 * 1024) + +/* Macro to hide a typecast. */ +#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) + +/** + * Structure to single define portal connection. + */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + lnet_process_id_t c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; + /** List of requests in the set */ + struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. 
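A short sketch of how a caller stashes per-request interpreter state in ptlrpc_req_async_args(), as the union ptlrpc_async_args comment above suggests. The struct and its fields are hypothetical; CLASSERT() is the libcfs compile-time assertion.

struct demo_async_args {
        struct obd_export       *aa_exp;
        int                      aa_retries;
};

static void demo_prepare_args(struct ptlrpc_request *req,
                              struct obd_export *exp)
{
        struct demo_async_args *aa;

        /* make sure the scratchpad is big enough for our struct */
        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
        aa = ptlrpc_req_async_args(req);
        aa->aa_exp = exp;
        aa->aa_retries = 0;
        /* read back later in the request's interpret callback */
}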
+ */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaq argument passed to completion \a set_interpret callback. */ + void *set_arg; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; +}; + +/** + * Description of a single ptrlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaq argument to pass to the callback */ + void *psc_data; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. + */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? 
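The request-set life cycle described above is usually driven as follows: populate a set with prepared requests, send them in parallel, then wait for all completions. ptlrpc_prep_set(), ptlrpc_set_add_req(), ptlrpc_set_wait() and ptlrpc_set_destroy() are assumed to be the PtlRPC helpers declared elsewhere; this is a sketch, not part of the patch.

static int demo_send_in_parallel(struct ptlrpc_request **reqs, int nr)
{
        struct ptlrpc_request_set *set;
        int i, rc;

        set = ptlrpc_prep_set();
        if (set == NULL)
                return -ENOMEM;

        for (i = 0; i < nr; i++)
                ptlrpc_set_add_req(set, reqs[i]);

        rc = ptlrpc_set_wait(set);      /* sends and waits for every RPC */
        ptlrpc_set_destroy(set);        /* drops the set reference */
        return rc;
}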
*/ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + lnet_handle_md_t rs_md_h; + atomic_t rs_refcount; + + /** Context for the service thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrypted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we just set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Number of locks awaiting client ACK */ + int rs_nlocks; + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + ldlm_mode_t rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREGISTERING = 0xebc0de05, + RQ_PHASE_UNDEFINED = 0xebc0de06 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). + */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a request from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + void (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. 
+ */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. 
+ * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre assert_spin_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). 
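To make the operations table above concrete, here is the shape of a minimal policy wiring up only the mandatory handlers. Everything named "demo" is hypothetical, the nr_u.fifo.fr_list linkage is assumed to exist in struct ptlrpc_nrs_request, and a real policy would keep its queue in pol_private under the NRS head lock rather than in file-scope statics.

static struct ptlrpc_nrs_resource demo_res;
static LIST_HEAD(demo_queue);

static int demo_res_get(struct ptlrpc_nrs_policy *policy,
                        struct ptlrpc_nrs_request *nrq,
                        const struct ptlrpc_nrs_resource *parent,
                        struct ptlrpc_nrs_resource **resp, bool moving_req)
{
        *resp = &demo_res;      /* one flat resource for every request */
        return 1;               /* leaf level */
}

static int demo_req_enqueue(struct ptlrpc_nrs_policy *policy,
                            struct ptlrpc_nrs_request *nrq)
{
        /* nr_u.fifo.fr_list: assumed per-request FIFO linkage */
        list_add_tail(&nrq->nr_u.fifo.fr_list, &demo_queue);
        return 0;
}

static void demo_req_dequeue(struct ptlrpc_nrs_policy *policy,
                             struct ptlrpc_nrs_request *nrq)
{
        list_del_init(&nrq->nr_u.fifo.fr_list);
}

static struct ptlrpc_nrs_request *
demo_req_get(struct ptlrpc_nrs_policy *policy, bool peek, bool force)
{
        /* NRS core dequeues via op_req_dequeue after a successful poll */
        return list_empty(&demo_queue) ? NULL :
               list_entry(demo_queue.next, struct ptlrpc_nrs_request,
                          nr_u.fifo.fr_list);
}

static const struct ptlrpc_nrs_pol_ops demo_nrs_ops = {
        .op_res_get     = demo_res_get,
        .op_req_get     = demo_req_get,
        .op_req_enqueue = demo_req_enqueue,
        .op_req_dequeue = demo_req_dequeue,
};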
+ */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; +}; + +#define NRS_POL_NAME_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. 
+ * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + struct module *nc_owner; + /** + * Policy registration flags; a bitmask of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. + * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. 
+ * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during unregistration time, + * then unregistration and lprocfs operations will be properly + * serialized. + */ + struct module *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. 
on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. 
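+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * the "my_*" names are hypothetical), a server-side handler wires this up
+ * by attaching an operations table to the request:
+ *
+ * \code
+ *	static struct ptlrpc_hpreq_ops my_hpreq_ops = {
+ *		.hpreq_lock_match	= my_hpreq_lock_match,
+ *		.hpreq_check		= my_hpreq_check,
+ *		.hpreq_fini		= my_hpreq_fini,
+ *	};
+ *
+ *	req->rq_ops = &my_hpreq_ops;
+ * \endcode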
+ * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and sent back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head rq_timed_list; + /** server-side history, used for debugging purposes. */ + struct list_head rq_history_list; + /** server-side per-export list */ + struct list_head rq_exp_list; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *rq_ops; + + /** initial thread servicing this request */ + struct ptlrpc_thread *rq_svc_thread; + + /** history sequence # */ + __u64 rq_history_seq; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request rq_nrq; + /** @} nrs */ + /** the index of service's srv_at_array into which request is linked */ + time_t rq_at_index; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + /** client-side flags are serialized by rq_lock */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, + rq_req_unlink:1, rq_reply_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ + /* server-side flags */ + rq_packed_final:1, /* packed final reply */ + rq_hp:1, /* high priority RPC */ + rq_at_linked:1, /* link into service's srv_at_array */ + rq_reply_truncate:1, + rq_committed:1, + /* whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1; + + unsigned int rq_nr_resend; + + enum rq_phase rq_phase; /* one of RQ_PHASE_* */ + enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ + atomic_t rq_refcount;/* client-side refcount for SENT race, + server-side refcount for multiple replies */ + + /** Portal to which this request would be sent */ + short rq_request_portal; /* XXX FIXME bug 249 */ + /** Portal where to wait for reply and where reply would be sent */ + short rq_reply_portal; /* XXX FIXME bug 249 */ + + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** + * List item to for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. + */ + struct list_head rq_replay_list; + + /** + * security and encryption data + * @{ */ + struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */ + struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */ + struct list_head rq_ctx_chain; /**< link to waited ctx */ + + struct sptlrpc_flavor rq_flvr; /**< for client & server */ + enum lustre_sec_part rq_sp_from; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_remote:1, /* authed as remote user */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1; /* pill initialized */ + + uid_t rq_auth_uid; /* authed uid */ + uid_t rq_auth_mapped_uid; /* authed uid mapped to */ + + /* (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *rq_user_desc; + + /* various buffer pointers */ + struct lustre_msg *rq_reqbuf; /* req wrapper */ + char *rq_repbuf; /* rep buffer */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ + struct lustre_msg *rq_clrbuf; /* only in priv mode */ + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned 
int rq_reply_off; + + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** What was import generation when this request was sent */ + int rq_import_generation; + enum lustre_imp_state rq_send_state; + + /** how many early replies (for stats) */ + int rq_early_count; + + /** client+server request */ + lnet_handle_md_t rq_req_md_h; + struct ptlrpc_cb_id rq_req_cbid; + /** optional time limit for send attempts */ + long rq_delay_limit; + /** time request was first queued */ + unsigned long rq_queued_time; + + /* server-side... */ + /** request arrival time */ + struct timeval rq_arrival_time; + /** separated reply state */ + struct ptlrpc_reply_state *rq_reply_state; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *rq_rqbd; + + /** client-only incoming reply */ + lnet_handle_md_t rq_reply_md_h; + wait_queue_head_t rq_reply_waitq; + struct ptlrpc_cb_id rq_reply_cbid; + + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + lnet_process_id_t rq_peer; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** Client side, import where request is being sent */ + struct obd_import *rq_import; + + /** Replay callback, called after request is replayed at recovery */ + void (*rq_replay_cb)(struct ptlrpc_request *); + /** + * Commit callback, called when request is committed and about to be + * freed. + */ + void (*rq_commit_cb)(struct ptlrpc_request *); + /** Opaq data for replay and commit callbacks. */ + void *rq_cb_data; + + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *rq_bulk; + + /** client outgoing req */ + /** + * when request/reply sent (secs), or time when request should be sent + */ + time_t rq_sent; + /** time for request really sent out */ + time_t rq_real_sent; + + /** when request must finish. volatile + * so that servers' early reply updates to the deadline aren't + * kept in per-cpu cache */ + volatile time_t rq_deadline; + /** when req reply unlink must finish. */ + time_t rq_reply_deadline; + /** when req bulk unlink must finish. */ + time_t rq_bulk_deadline; + /** + * service time estimate (secs) + * If the requestsis not served by this time, it is marked as timed out. + */ + int rq_timeout; + + /** Multi-rpc bits */ + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t rq_set_waitq; + /** Link item for request set lists */ + struct list_head rq_set_chain; + /** Link back to the request set */ + struct ptlrpc_request_set *rq_set; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t rq_interpret_reply; + /** Async completion context */ + union ptlrpc_async_args rq_async_args; + + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + + struct lu_context rq_session; + struct lu_context rq_recov_session; + + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. 
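+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * "my_interpret" is a hypothetical name and its signature is assumed to
+ * match ptlrpc_interpterer_t), a caller wanting asynchronous completion
+ * sets the handler and queues the request on a set:
+ *
+ * \code
+ *	static int my_interpret(const struct lu_env *env,
+ *				struct ptlrpc_request *req,
+ *				void *args, int rc)
+ *	{
+ *		return rc;
+ *	}
+ *
+ *	req->rq_interpret_reply = my_interpret;
+ *	ptlrpc_set_add_req(set, req);
+ * \endcode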
+ */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns 1 if request buffer at offset \a index was already swabbed + */ +static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); +} + +/** + * Returns 1 if request reply buffer at offset \a index was already swabbed + */ +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); +} + +/** + * Returns 1 if request needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns 1 if request reply needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); + req->rq_req_swab_mask |= 1 << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); + req->rq_rep_swab_mask |= 1 << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREGISTERING: + return "Unregistering"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text stringi + * description + */ +static 
inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __printf(3, 4); + +/** + * Helper that decides if we need to print request according to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while (0) + +/** + * This is the debug print function you need to use to print request structure + * content into lustre debug log. + * For most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static struct cfs_debug_limit_state cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +#define BULK_GET_SOURCE 0 +#define BULK_PUT_SINK 1 +#define BULK_GET_SINK 2 +#define BULK_PUT_SOURCE 3 + +/** + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed by a transfer (or receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulk transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. 
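+ *
+ * As a rough, illustrative client-side sketch only (not part of the
+ * original header; "npages" and "pages" are placeholders, the portal and
+ * page-size constants are only indicative, and error handling is omitted),
+ * a bulk write could be prepared as:
+ *
+ * \code
+ *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_GET_SOURCE,
+ *				    OST_BULK_PORTAL);
+ *	for (i = 0; i < npages; i++)
+ *		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+ * \endcode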
+ */ +struct ptlrpc_bulk_desc { + /** completed with failure */ + unsigned long bd_failure:1; + /** {put,get}{source,sink} */ + unsigned long bd_type:2; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_xid; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* + * encrypt iov, size is either 0 or bd_iov_count. + */ + lnet_kiov_t *bd_enc_iov; + + lnet_kiov_t bd_iov[0]; +}; + +enum { + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + * + * Lustre watchdog is removed for client in the hope + * of a generic watchdog can be merged in kernel. + * When that happens, we should add below back. 
+ * + * struct lc_watchdog *t_watchdog; + */ + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline int thread_is_event(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_EVENT); +} + +static inline int thread_is_signal(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_SIGNAL); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + +/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * notifies wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. + */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + lnet_handle_md_t rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->srv_init(). 
+ */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patch dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and performs actions for a specific server like IO service for OST + * or general metadata service for MDS. + */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** service thread list */ + struct list_head srv_threads; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of /proc dir tree for this service */ + struct proc_dir_entry *srv_procroot; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. + */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, we + * will have multiple instances very soon (instance per CPT). 
+ * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + long scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. 
+ */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + unsigned long scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in cfs_daemonize() + */ + char pc_name[16]; + /** + * Environment for request interpreters to run in. + */ + struct lu_env pc_env; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = 1 << 3, + /** + * The ptlrpcd is bound to some CPU core. 
+ */ + LIOD_BIND = 1 << 4, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern lnet_handle_eq_t ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(lnet_event_t *ev); +extern void reply_in_callback(lnet_event_t *ev); +extern void client_bulk_callback(lnet_event_t *ev); +extern void request_in_callback(lnet_event_t *ev); +extern void reply_out_callback(lnet_event_t *ev); +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + req->rq_bulk_deadline > get_seconds()) + return 1; + + if (!desc) + return 0; + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
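+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * "imp", "format", "version" and "opcode" are placeholders and error
+ * handling is omitted), the common synchronous pattern is:
+ *
+ * \code
+ *	req = ptlrpc_request_alloc_pack(imp, format, version, opcode);
+ *	if (req == NULL)
+ *		return -ENOMEM;
+ *	ptlrpc_request_set_replen(req);
+ *	rc = ptlrpc_queue_wait(req);
+ *	ptlrpc_req_finished(req);
+ * \endcode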
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_expired_set(void *data); +void ptlrpc_interrupted_set(void *data); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + void (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, + int opcode, int count, __u32 *lengths, + char **bufs); +struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); +static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 1); +} +static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 0); +} +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int); +static 
inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to post */ + unsigned int bc_buf_size; + /* portal to listen for requests on */ + unsigned int bc_req_portal; + /* portal to send replies to */ + unsigned int bc_rep_portal; + /* maximum request size to be accepted for this service */ + unsigned int bc_req_max_size; + /* maximum reply size this service can ever send */ + unsigned int bc_rep_max_size; +}; + +struct ptlrpc_service_thr_conf { + /* threadname should be 8 characters or less - 6 will be added on */ + char *tc_thr_name; + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ + unsigned int tc_nthrs_max; + /* user-specified threads number; it will be validated against the + * other members of this structure. */ + unsigned int tc_nthrs_user; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; + /* Tags for lu_context associated with service thread */ + __u32 tc_ctx_tags; +}; + +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + +struct ptlrpc_service_conf { + /* service name */ + char *psc_name; + /* soft watchdog timeout multiplier to print stuck service traces */ + unsigned int psc_watchdog_factor; + /* buffer information */ + struct ptlrpc_service_buf_conf psc_buf; + /* thread information */ + struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; + /* function table */ + struct ptlrpc_service_ops psc_ops; +}; + +/* ptlrpc/service.c */ +/** + * Server-side services API. 
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); + +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +int lustre_msg_hdr_size(__u32 magic, int count); +int lustre_msg_size(__u32 magic, int count, __u32 *lengths); +int lustre_msg_size_v2(int count, __u32 *lengths); +int lustre_packed_msg_size(struct lustre_msg *msg); +int lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); +void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); +int lustre_msg_buflen(struct lustre_msg *m, int n); +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len); +int lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, int n, int max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct 
lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_flags(struct lustre_msg *msg, int flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, int version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +void lustre_msg_set_handle(struct lustre_msg *msg, + struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_last_committed(struct lustre_msg *msg, + __u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. + */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). 
+ */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREGISTERING) { + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 0; + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply; + rc = rc || req->rq_req_unlink || req->rq_reply_unlink; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + 
cfs_time_current())) { + return 1; + } + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); +struct ptlrpc_request *ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc daemon bind policy */ +typedef enum { + /* all ptlrpcd threads are free mode */ + PDB_POLICY_NONE = 1, + /* all ptlrpcd threads are bound mode */ + PDB_POLICY_FULL = 2, + /* ... */ + PDB_POLICY_PAIR = 3, + /* ... , + * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1]. + * If kernel supports NUMA, pthrpcd threads are binded and + * grouped by NUMA node */ + PDB_POLICY_NEIGHBOR = 4, +} pdb_policy_t; + +/* ptlrpc daemon load policy + * It is caller's duty to specify how to push the async RPC into some ptlrpcd + * queue, but it is not enforced, affected by "ptlrpcd_bind_policy". If it is + * "PDB_POLICY_FULL", then the RPC will be processed by the selected ptlrpcd, + * Otherwise, the RPC may be processed by the selected ptlrpcd or its partner, + * depends on which is scheduled firstly, to accelerate the RPC processing. 
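The bind-policy comment above (PDB_POLICY_NEIGHBOR) states that each ptlrpcd[X] has thread[X-1] and thread[X+1] as partners. A minimal user-space sketch of that pairing follows; the modulo wrap-around at the ends of the thread array is an assumption of this sketch, not something stated by the header, and neighbor_partners() is a hypothetical helper, not part of the ptlrpcd API.

#include <stdio.h>

/* Illustrative only: compute the two NEIGHBOR-style partners of ptlrpcd[X].
 * The wrap-around at the array ends is an assumption of this sketch. */
static void neighbor_partners(int x, int nthreads, int *prev, int *next)
{
        *prev = (x + nthreads - 1) % nthreads;
        *next = (x + 1) % nthreads;
}

int main(void)
{
        const int nthreads = 4;
        int x, prev, next;

        for (x = 0; x < nthreads; x++) {
                neighbor_partners(x, nthreads, &prev, &next);
                printf("ptlrpcd[%d] partners: ptlrpcd[%d] and ptlrpcd[%d]\n",
                       x, prev, next);
        }
        return 0;
}

As described above, the load policy (PDL_POLICY_*, defined just below) only chooses which queue an async RPC is pushed to; whether the partner may pick it up is governed by the bind policy.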
*/ +typedef enum { + /* on the same CPU core as the caller */ + PDL_POLICY_SAME = 1, + /* within the same CPU partition, but not the same core as the caller */ + PDL_POLICY_LOCAL = 2, + /* round-robin on all CPU cores, but not the same core as the caller */ + PDL_POLICY_ROUND = 3, + /* the specified CPU core is preferred, but not enforced */ + PDL_POLICY_PREFERRED = 4, +} pdl_policy_t; + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char *ll_opcode2str(__u32 opcode); +#if defined (CONFIG_PROC_FS) +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; + +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_param.h b/kernel/drivers/staging/lustre/lustre/include/lustre_param.h new file mode 100644 index 000000000..ed654684c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_param.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_param.h + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_PARAM_H +#define _LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +/* obd_config.c */ +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, __u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, __u32 net); +/* obd_mount.c */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4); + + + +/****************** User-settable parameter keys *********************/ +/* e.g. + tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + ... testfs-MDT0000.lov.stripesize=4M + ... testfs-OST0000.ost.client_cache_seconds=15 + ... testfs.sys.timeout= + ... testfs.llite.max_read_ahead_mb=16 +*/ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _LUSTRE_PARAM_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h b/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h new file mode 100644 index 000000000..2643f2807 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
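(Illustration for the user-settable parameter keys listed above.) The PARAM_* strings all follow a "key=" convention, with class_match_param() and related helpers in obd_config.c locating the value that follows the key. Below is a simplified stand-alone analogue; match_param() is hypothetical and does not claim to reproduce the in-kernel helpers' signatures or return conventions.

#include <stdio.h>
#include <string.h>

#define PARAM_FAILNODE "failover.node="  /* copied from the key list above */

/* Hypothetical, simplified stand-in for the key matching done in obd_config.c:
 * if buf starts with key, return a pointer to the value, otherwise NULL. */
static const char *match_param(const char *buf, const char *key)
{
        size_t klen = strlen(key);

        return strncmp(buf, key, klen) == 0 ? buf + klen : NULL;
}

int main(void)
{
        const char *param = "failover.node=192.168.0.13@tcp0";
        const char *val = match_param(param, PARAM_FAILNODE);

        if (val)
                printf("failover nid: %s\n", val);  /* 192.168.0.13@tcp0 */
        return 0;
}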
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include +#include +#include + +#include "dt_object.h" +#include "lustre_fid.h" +#include "lustre_dlm.h" + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration procedure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota, it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. 
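A toy model of the lifecycle just described (init, prepare, start, then op_begin/op_end pairs, and finally fini); the real prototypes follow below. The enum and helper here are illustrative only and are not part of the qsd API; the one behavioural detail encoded, that qsd_op_begin() enforces nothing before qsd_start() has run, is taken from the text above.

#include <stdio.h>

enum qsd_stage { QSD_ALLOCATED, QSD_PREPARED, QSD_STARTED, QSD_STOPPED };

static const char *enforcing(enum qsd_stage s)
{
        /* per the description above, only a started instance enforces quota */
        return s == QSD_STARTED ? "enforced" : "pass-through";
}

int main(void)
{
        enum qsd_stage s = QSD_ALLOCATED;       /* qsd_init()    */
        printf("after init:    op_begin is %s\n", enforcing(s));

        s = QSD_PREPARED;                       /* qsd_prepare() */
        printf("after prepare: op_begin is %s\n", enforcing(s));

        s = QSD_STARTED;                        /* qsd_start()   */
        printf("after start:   op_begin is %s\n", enforcing(s));

        s = QSD_STOPPED;                        /* qsd_fini()    */
        return 0;
}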
*/ + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + struct proc_dir_entry *); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, int *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); +/* This is exported for the ldiskfs quota migration only, + * see convert_quota_file() */ +int lquota_disk_write_glb(const struct lu_env *, struct dt_object *, + __u64, struct lquota_glb_rec *); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. */ +#define QUOTA_MAX_TRANSIDS 4 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h new file mode 100644 index 000000000..c6457b27c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h @@ -0,0 +1,341 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. */ +#define REQ_MAX_FIELD_NR 9 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +#if !defined(__REQ_LAYOUT_USER__) + +/* struct ptlrpc_request, lustre_msg* */ +#include "lustre_net.h" + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size); +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct 
req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen); +int req_layout_init(void); +void req_layout_fini(void); + +/* __REQ_LAYOUT_USER__ */ +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_GETSTATUS; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_UPDATE_OBJ; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. + */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_RELEASE_CLOSE; +extern struct req_format RQF_MDS_PIN; +extern struct req_format RQF_MDS_UNPIN; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_WRITEPAGE; +extern struct req_format RQF_MDS_IS_SUBDIR; +extern struct req_format RQF_MDS_DONE_WRITING; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACHECK; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QC_CALLBACK; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACHECK; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format 
RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO_GENERIC; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +/* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; + +extern struct req_format RQF_CONNECT; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; + +/* + * connection handle received in MDS_CONNECT request. 
+ */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_UPDATE; +extern struct req_msg_field RMF_UPDATE_REPLY; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h b/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h new file mode 100644 index 000000000..dff70a5b9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h @@ -0,0 +1,1147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +/* Linux specific */ +struct key; +struct seq_file; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + 
SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. + */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. 
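A stand-alone illustration of the flavor encoding defined above: the same bit offsets and a local copy of the MAKE_FLVR()/extraction logic, composing the krb5i combination (GSS policy 2, krb5 mech 1, integrity svc 2, default bulk type 0, integrity bulk svc 2, values taken from the enums above) and pulling the fields back out. uint32_t stands in for __u32 so the sketch compiles in user space; it is a demonstration of the layout, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

/* offsets copied from the header above */
#define FLVR_POLICY_OFFSET      0
#define FLVR_MECH_OFFSET        4
#define FLVR_SVC_OFFSET         8
#define FLVR_BULK_TYPE_OFFSET   12
#define FLVR_BULK_SVC_OFFSET    16

#define MAKE_FLVR(policy, mech, svc, btype, bsvc)               \
        (((uint32_t)(policy) << FLVR_POLICY_OFFSET) |           \
         ((uint32_t)(mech)   << FLVR_MECH_OFFSET)   |           \
         ((uint32_t)(svc)    << FLVR_SVC_OFFSET)    |           \
         ((uint32_t)(btype)  << FLVR_BULK_TYPE_OFFSET) |        \
         ((uint32_t)(bsvc)   << FLVR_BULK_SVC_OFFSET))

/* each field is 4 bits wide, as in the SPTLRPC_FLVR_*() extractors */
#define FLVR_FIELD(flavor, off) \
        ((unsigned int)(((uint32_t)(flavor) >> (off)) & 0xFU))

int main(void)
{
        uint32_t flvr = MAKE_FLVR(2, 1, 2, 0, 2);       /* krb5i */

        printf("wire flavor: 0x%05x\n", (unsigned int)flvr); /* 0x20212 */
        printf("policy=%u mech=%u svc=%u btype=%u bsvc=%u\n",
               FLVR_FIELD(flvr, FLVR_POLICY_OFFSET),
               FLVR_FIELD(flvr, FLVR_MECH_OFFSET),
               FLVR_FIELD(flvr, FLVR_SVC_OFFSET),
               FLVR_FIELD(flvr, FLVR_BULK_TYPE_OFFSET),
               FLVR_FIELD(flvr, FLVR_BULK_SVC_OFFSET));
        return 0;
}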
+ */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*force_die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. 
+ * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + unsigned long cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. + */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). 
+ */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintenance mechanism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). + */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). 
+ */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). 
+ */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + unsigned long ps_gc_interval; /* in seconds */ + unsigned long ps_gc_next; /* in seconds */ +}; + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char *sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * round size up to next power of 2, for slab allocation. 
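+ * e.g. size_roundup_power2(3000) returns 4096, and a value that is
+ * already a power of two (such as 4096) is returned unchanged.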
+ * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline +struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline +void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy implementation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); + +/* + * exported higher interface of 
import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char *sec2target_str(struct ptlrpc_sec *sec); +/* + * lprocfs + */ +#if defined (CONFIG_PROC_FS) +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); +#else +#define sptlrpc_proc_root NULL +static inline int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ return 0; } +#endif + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define 
CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +enum { + LUSTRE_SEC_NONE = 0, + LUSTRE_SEC_REMOTE = 1, + LUSTRE_SEC_SPECIFY = 2, + LUSTRE_SEC_ALL = 3 +}; + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h b/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h new file mode 100644 index 000000000..caa4da12f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h @@ -0,0 +1,26 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ +/* This file automatically generated from lustre/include/lustre_ver.h.in, + * based on parameters in lustre/autoconf/lustre-version.ac. + * Changes made directly to this file will be lost. */ + +#define LUSTRE_MAJOR 2 +#define LUSTRE_MINOR 3 +#define LUSTRE_PATCH 64 +#define LUSTRE_FIX 0 +#define LUSTRE_VERSION_STRING "2.3.64" + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR, \ + LUSTRE_MINOR, LUSTRE_PATCH, \ + LUSTRE_FIX) + +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). */ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/obd.h b/kernel/drivers/staging/lustre/lustre/include/obd.h new file mode 100644 index 000000000..2a88b806f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd.h @@ -0,0 +1,1496 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_H +#define __OBD_H + +#include "linux/obd.h" + +#define IOC_OSC_TYPE 'h' +#define IOC_OSC_MIN_NR 20 +#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) +#define IOC_OSC_MAX_NR 50 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_MIN_NR 20 +#define IOC_MDC_MAX_NR 50 + +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lu_ref.h" +#include "lustre_export.h" +#include "lustre_fid.h" +#include "lustre_fld.h" +#include "lustre_capa.h" + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +static inline void loi_init(struct lov_oinfo *loi) +{ +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + __u64 lsm_maxbytes; + struct { + /* Public members. */ + struct ost_id lw_object_oi; /* lov object id/seq */ + + /* LOV-private members start here -- only for use in lov/. */ + __u32 lw_magic; + __u32 lw_stripe_size; /* size of the stripe */ + __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ + __u16 lw_stripe_count; /* number of objects being striped over */ + __u16 lw_layout_gen; /* generation of the layout */ + char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + } lsm_wire; + + struct lov_oinfo *lsm_oinfo[0]; +}; + +#define lsm_oi lsm_wire.lw_object_oi +#define lsm_magic lsm_wire.lw_magic +#define lsm_layout_gen lsm_wire.lw_layout_gen +#define lsm_stripe_size lsm_wire.lw_stripe_size +#define lsm_pattern lsm_wire.lw_pattern +#define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name + +static inline bool lsm_is_released(struct lov_stripe_md *lsm) +{ + return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED); +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return false; + if (lsm_is_released(lsm)) + return false; + return true; +} + +static inline int lov_stripe_md_size(unsigned int stripe_count) +{ + struct lov_stripe_md lsm; + + return sizeof(lsm) + stripe_count * sizeof(lsm.lsm_oinfo[0]); +} + +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* Lock policy. It keeps an extent which is specific for a particular + * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy, + * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */ + ldlm_policy_data_t oi_policy; + /* Flags used for set request specific flags: + - while lock handling, the flags obtained on the enqueue + request are set here. + - while stats, the flags used for control delay/resend. + - while setattr, the flags used for distinguish punch operation + */ + __u64 oi_flags; + /* Lock handle specific for every OSC lock. */ + struct lustre_handle *oi_lockh; + /* lsm data specific for every OSC. 
*/ + struct lov_stripe_md *oi_md; + /* obdo data specific for every OSC, if needed at all. */ + struct obdo *oi_oa; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every received + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; + /* oss capability, its type is obd_capa in client to avoid copy. + * in contrary its type is lustre_capa in OSS. */ + void *oi_capa; + /* transfer jobid from ost_sync() to filter_sync()... */ + char *oi_jobid; +}; + +/* compare all relevant fields. */ +static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, + struct lov_stripe_md *m2) +{ + /* + * ->lsm_wire contains padding, but it should be zeroed out during + * allocation. + */ + return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire)); +} + +static inline int lov_lum_lsm_cmp(struct lov_user_md *lum, + struct lov_stripe_md *lsm) +{ + if (lsm->lsm_magic != lum->lmm_magic) + return 1; + if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) && + (lsm->lsm_stripe_count != lum->lmm_stripe_count)) + return 2; + if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) && + (lsm->lsm_stripe_size != lum->lmm_stripe_size)) + return 3; + if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) && + (lsm->lsm_pattern != lum->lmm_pattern)) + return 4; + if ((lsm->lsm_magic == LOV_MAGIC_V3) && + (strncmp(lsm->lsm_pool_name, + ((struct lov_user_md_v3 *)lum)->lmm_pool_name, + LOV_MAXPOOLNAME) != 0)) + return 5; + return 0; +} + +static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3, + int *lmm_magic, + struct lov_user_md *lum) +{ + if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1))) + return -EFAULT; + + *lmm_magic = lumv3->lmm_magic; + + if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3); + *lmm_magic = LOV_USER_MAGIC_V1; + } else if (*lmm_magic == LOV_USER_MAGIC_V3) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + lustre_swab_lov_user_md_v3(lumv3); + *lmm_magic = LOV_USER_MAGIC_V3; + } else if (*lmm_magic != LOV_USER_MAGIC_V1) { + CDEBUG(D_IOCTL, + "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3); + return -EINVAL; + } + return 0; +} + +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; +}; + +struct brw_page { + u64 off; + struct page *pg; + int count; + u32 flag; +}; + +/* llog contexts */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT, + LLOG_MDS_OST_ORIG_CTXT, + LLOG_MDS_OST_REPL_CTXT, + LLOG_SIZE_ORIG_CTXT, + LLOG_SIZE_REPL_CTXT, + LLOG_RD1_ORIG_CTXT, + LLOG_RD1_REPL_CTXT, + LLOG_TEST_ORIG_CTXT, + LLOG_TEST_REPL_CTXT, + LLOG_LOVEA_ORIG_CTXT, + LLOG_LOVEA_REPL_CTXT, + LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ + 
LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ + LLOG_AGENT_ORIG_CTXT, /**< agent requests generation on cdt */ + LLOG_MAX_CTXTS +}; + +struct timeout_item { + enum timeout_event ti_event; + unsigned long ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OSC_MAX_RIF_DEFAULT 8 +#define MDS_OSC_MAX_RIF_DEFAULT 50 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES +}; + +#define MDC_MAX_RIF_DEFAULT 8 +#define MDC_MAX_RIF_MAX 512 + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + int cl_conn_count; + /* max_mds_easize is purely a performance thing so we don't have to + * call obd_size_diskmd() all the time. */ + int cl_default_mds_easize; + int cl_max_mds_easize; + int cl_default_mds_cookiesize; + int cl_max_mds_cookiesize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + long cl_dirty; /* all _dirty_ in bytes */ + long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ + long cl_avail_grant; /* bytes of credit for ost */ + long cl_lost_grant; /* lost credits (trunc) */ + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + unsigned long cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ + int cl_chunkbits; + int cl_chunk; + int cl_extent_tax; /* extent overhead, by bytes */ + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together + * with client_obd_list_{un,}lock() and + * client_obd_list_lock_{init,done}() functions. + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. 
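+ *
+ * A typical (illustrative) access pattern under this lock, using the
+ * locking helpers named above; \a cli and \a empty are hypothetical
+ * caller locals:
+ *
+ * \code
+ *	client_obd_list_lock(&cli->cl_loi_list_lock);
+ *	empty = list_empty(&cli->cl_loi_ready_list);
+ *	client_obd_list_unlock(&cli->cl_loi_list_lock);
+ * \endcode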
+ */ + client_obd_lock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + int cl_r_in_flight; + int cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + __u32 cl_max_pages_per_rpc; + int cl_max_rpcs_in_flight; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /* lru for osc caching pages */ + struct cl_client_cache *cl_cache; + struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ + atomic_t *cl_lru_left; + atomic_t cl_lru_busy; + atomic_t cl_lru_shrinkers; + atomic_t cl_lru_in_list; + struct list_head cl_lru_list; /* lru page list */ + client_obd_lock_t cl_lru_list_lock; /* page list protector */ + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + struct mdc_rpc_lock *cl_rpc_lock; + struct mdc_rpc_lock *cl_close_lock; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + cksum_type_t cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* used by quotacheck when the servers are older than 2.4 */ + int cl_qchk_stat; /* quotacheck stat of the peer */ +#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ +#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) +#warning "please consider removing quotacheck compatibility code" +#endif + + /* sequence manager */ + struct lu_client_seq *cl_seq; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[MAXQUOTAS]; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + __u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + int ec_nstripes; + __u64 ec_unique; +}; + +struct lov_qos_oss { + struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lqo_oss_list; /* link to lov_qos */ + __u64 lqo_bavail; /* total bytes avail on OSS */ + __u64 lqo_penalty; /* current penalty */ + __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ +}; + +struct ltd_qos { + struct lov_qos_oss *ltq_oss; /* oss info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ + __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ + unsigned int ltq_usable:1; /* usable for striping */ +}; + +/* Generic subset of OSTs */ +struct ost_pool 
{ + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 + +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; +/* Stripe placement optimization */ +struct lov_qos { + struct list_head lq_oss_list; /* list of OSSs that targets use */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_oss_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lov_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the ost's all have approx. + the same space avail */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in + progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ +}; + +struct lov_tgt_desc { + struct list_head ltd_kill; + struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; + struct obd_export *ltd_exp; + struct ltd_qos ltd_qos; /* qos info per target */ + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1; /* should this target be deleted */ +}; + +/* Pool metadata */ +#define pool_tgt_size(_p) _p->pool_obds.op_size +#define pool_tgt_count(_p) _p->pool_obds.op_count +#define pool_tgt_array(_p) _p->pool_obds.op_array +#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + atomic_t pool_refcount; /* pool ref. 
counter */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct proc_dir_entry *pool_proc_entry; /* file in /proc */ + struct obd_device *pool_lobd; /* obd of the lov/lod to which + * this pool belongs */ +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_tgt_count; /* how many OBD's */ + __u32 lov_active_tgt_count; /* how many active */ + __u32 lov_death_row;/* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct cfs_hash *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU pages from upper layer */ + void *lov_cache; + + struct rw_semaphore lov_notify_lock; +}; + +struct lmv_tgt_desc { + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + int ltd_idx; + struct mutex ltd_fid_mutex; + unsigned long ltd_active:1; /* target up for requests */ +}; + +enum placement_policy { + PLACEMENT_CHAR_POLICY = 0, + PLACEMENT_NID_POLICY = 1, + PLACEMENT_INVAL_POLICY = 2, + PLACEMENT_MAX_POLICY +}; + +struct lmv_obd { + int refcount; + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + enum placement_policy lmv_placement; + struct lmv_desc desc; + struct obd_uuid cluuid; + struct obd_export *exp; + + struct mutex init_mutex; + int connected; + int max_easize; + int max_def_easize; + int max_cookiesize; + int max_def_cookiesize; + int server_timeout; + + int tgts_size; /* size of tgts array */ + struct lmv_tgt_desc **tgts; + + struct obd_connect_data conn_data; +}; + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 len; + __u32 flags; + struct page *page; + struct dentry *dentry; + int lnb_grant_used; + int rc; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ +#define N_LOCAL_TEMP_PAGE 0x10000000 + +struct obd_trans_info { + __u64 oti_transno; + __u64 oti_xid; + /* Only used on the server side for tracking acks. 
*/ + struct oti_req_ack_lock { + struct lustre_handle lock; + __u32 mode; + } oti_ack_locks[4]; + void *oti_handle; + struct llog_cookie oti_onecookie; + struct llog_cookie *oti_logcookies; + int oti_numcookies; + /** synchronous write is needed */ + unsigned long oti_sync_write:1; + + /* initial thread handling transaction */ + struct ptlrpc_thread *oti_thread; + __u32 oti_conn_cnt; + /** VBR: versions */ + __u64 oti_pre_version; + /** JobID */ + char *oti_jobid; + + struct obd_uuid *oti_ost_uuid; +}; + +static inline void oti_init(struct obd_trans_info *oti, + struct ptlrpc_request *req) +{ + if (oti == NULL) + return; + memset(oti, 0, sizeof(*oti)); + + if (req == NULL) + return; + + oti->oti_xid = req->rq_xid; + /** VBR: take versions from request */ + if (req->rq_reqmsg != NULL && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg); + + oti->oti_pre_version = pre_version ? pre_version[0] : 0; + oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + /** called from mds_create_objects */ + if (req->rq_repmsg != NULL) + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + oti->oti_thread = req->rq_svc_thread; + if (req->rq_reqmsg != NULL) + oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); +} + +static inline void oti_alloc_cookies(struct obd_trans_info *oti, + int num_cookies) +{ + if (!oti) + return; + + if (num_cookies == 1) + oti->oti_logcookies = &oti->oti_onecookie; + else + OBD_ALLOC_LARGE(oti->oti_logcookies, + num_cookies * sizeof(oti->oti_onecookie)); + + oti->oti_numcookies = num_cookies; +} + +static inline void oti_free_cookies(struct obd_trans_info *oti) +{ + if (!oti || !oti->oti_logcookies) + return; + + if (oti->oti_logcookies == &oti->oti_onecookie) + LASSERT(oti->oti_numcookies == 1); + else + OBD_FREE_LARGE(oti->oti_logcookies, + oti->oti_numcookies*sizeof(oti->oti_onecookie)); + oti->oti_logcookies = NULL; + oti->oti_numcookies = 0; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* target added */ + OBD_NOTIFY_CREATE, + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Device disconnected */ + OBD_NOTIFY_DISCON, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC, + /* Configuration event */ + OBD_NOTIFY_CONFIG, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). 
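+ *
+ * The observing obd invokes the registered hook roughly as follows
+ * (illustrative sketch; \a observer, \a watched, \a ev and \a rc are
+ * hypothetical caller locals, error handling omitted):
+ *
+ * \code
+ *	struct obd_notify_upcall *onu = &observer->obd_upcall;
+ *	int rc = 0;
+ *
+ *	if (onu->onu_upcall != NULL)
+ *		rc = onu->onu_upcall(observer, watched, ev,
+ *				     onu->onu_owner, NULL);
+ * \endcode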
+ */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + int olg_seq; + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct lvfs_run_ctxt { + struct dt_device *dt; +}; + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; + + /* common and UUID name of this device */ + char obd_name[MAX_OBD_NAME]; + struct obd_uuid obd_uuid; + + struct lu_device *obd_lu_dev; + + int obd_minor; + /* bitfield modification is protected by obd_dev_lock */ + unsigned long obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1,/* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery is enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_async_recov:1, /* allow asynchronous orphan cleanup */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1; /* device is processing mgs config */ + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct cfs_hash *obd_uuid_hash; + /* nid-export hash body */ + struct cfs_hash *obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + struct list_head obd_nid_stats; + atomic_t obd_refcount; + wait_queue_head_t obd_refcount_waitq; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + struct mutex obd_dev_mutex; + __u64 obd_last_committed; + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + __u64 obd_osfs_age; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time_t obd_eviction_timer; /* for ping evictor */ + + int obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + int obd_delayed_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct timer_list obd_recovery_timer; + time_t obd_recovery_start; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ + int obd_recovery_time_hard; + int obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + struct target_recovery_data obd_recovery_data; + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + int obd_recovery_stage; + + union { + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + /* Fields used by LProcFS */ + unsigned int obd_cntr_base; + struct lprocfs_stats *obd_stats; + + unsigned int md_cntr_base; + struct lprocfs_stats *md_stats; + + struct proc_dir_entry *obd_proc_entry; + void *obd_proc_private; /* type private PDEs */ + struct proc_dir_entry *obd_proc_exports_entry; + struct proc_dir_entry *obd_svc_procroot; + struct lprocfs_stats *obd_svc_stats; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * Ldlm pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + int obd_pool_limit; + __u64 obd_pool_slv; + + /** + * A list of outstanding class_incref()'s against this obd. For + * debugging. 
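+ *
+ * Conceptually (an illustrative sketch; \a scope and \a source stand
+ * for the caller-supplied debugging tags), each class_incref() records
+ * a link here and the matching class_decref() releases it:
+ *
+ * \code
+ *	lu_ref_add(&obd->obd_reference, scope, source);
+ *	lu_ref_del(&obd->obd_reference, scope, source);
+ * \endcode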
+ */ + struct lu_ref obd_reference; + + int obd_conn_inprogress; +}; + +#define OBD_LLOG_FL_SENDNOW 0x0001 +#define OBD_LLOG_FL_EXIT 0x0002 + +enum obd_cleanup_stage { +/* Special case hack for MDS LOVs */ + OBD_CLEANUP_EARLY, +/* can be directly mapped to .ldto_device_fini() */ + OBD_CLEANUP_EXPORTS, +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_BLOCKSIZE "blocksize" +#define KEY_CAPA_KEY "capa_key" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_LOCK_TO_STRIPE "lock_to_stripe" +#define KEY_LOVDESC "lovdesc" +#define KEY_LOV_IDX "lov_idx" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MAX_COOKIESIZE "max_cookiesize" +#define KEY_DEFAULT_COOKIESIZE "default_cookiesize" +#define KEY_MDS_CONN "mds_conn" +#define KEY_MGSSEC "mgssec" +#define KEY_NEXT_ID "next_id" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_CONNECT_FLAG "connect_flags" +#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_CHANGELOG_INDEX "changelog_index" + +struct lu_context; + +/* /!\ must be coherent with include/linux/namei.h on patched kernel */ +#define IT_OPEN (1 << 0) +#define IT_CREAT (1 << 1) +#define IT_READDIR (1 << 2) +#define IT_GETATTR (1 << 3) +#define IT_LOOKUP (1 << 4) +#define IT_UNLINK (1 << 5) +#define IT_TRUNC (1 << 6) +#define IT_GETXATTR (1 << 7) +#define IT_EXEC (1 << 8) +#define IT_PIN (1 << 9) +#define IT_LAYOUT (1 << 10) +#define IT_QUOTA_DQACQ (1 << 11) +#define IT_QUOTA_CONN (1 << 12) +#define IT_SETXATTR (1 << 13) + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) + return LCK_CR; + else if (it->it_op & IT_GETXATTR) + return LCK_PR; + else if (it->it_op & IT_SETXATTR) + return LCK_PW; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usually parent) */ + struct lu_fid op_fid2; /* operation fid2 (usually child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + u32 op_mds; /* what mds server open will go to */ + struct lustre_handle op_handle; + s64 op_mod_time; + const char *op_name; + int op_namelen; + __u32 op_mode; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + + /* iattr fields and blocks. 
*/ + struct iattr op_attr; + unsigned int op_attr_flags; + __u64 op_valid; + loff_t op_attr_blocks; + + /* Size-on-MDS epoch and flags. */ + __u64 op_ioepoch; + __u32 op_flags; + + /* Capa fields */ + struct obd_capa *op_capa1; + struct obd_capa *op_capa2; + + /* Various operation flags. */ + enum mds_op_bias op_bias; + + /* Operation type */ + __u32 op_opc; + + /* Used by readdir */ + __u64 op_offset; + + /* Used by readdir */ + __u32 op_npages; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + __u32 op_cli_flags; + + /* File object data version for HSM release, on client */ + __u64 op_data_version; + struct lustre_handle op_lease_handle; +}; + +enum op_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + int (*mi_cb)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + __u64 mi_cbdata; + unsigned int mi_generation; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_attach)(struct obd_device *dev, u32 len, void *data); + int (*o_detach)(struct obd_device *dev); + int (*o_setup)(struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev, + enum obd_cleanup_stage cleanup_stage); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, u32 len, void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. 
+ */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *set); + int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src); + int (*o_unpackmd)(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, int disk_len); + int (*o_preallocate)(struct lustre_handle *, u32 *req, u64 *ids); + /* FIXME: add fid capability support for create & destroy! */ + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti); + int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo); + int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set); + int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local, + struct obd_trans_info *oti, struct lustre_capa *capa); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc); + int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + /* metadata-only methods */ + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid)(struct obd_export *exp); + + /* quota methods */ + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. 
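+ *
+ * Those wrappers dispatch through this table; the sketch below shows
+ * the usual pattern (illustrative only: the OBP() macro and the statfs
+ * call follow the common obd_class.h style, with NULL checks omitted):
+ *
+ * \code
+ *	#define OBP(dev, op)	(dev)->obd_type->typ_dt_ops->o_##op
+ *
+ *	rc = OBP(exp->exp_obd, statfs)(env, exp, osfs, max_age, flags);
+ * \endcode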
*/ +}; + +enum { + LUSTRE_OPC_MKDIR = (1 << 0), + LUSTRE_OPC_SYMLINK = (1 << 1), + LUSTRE_OPC_MKNOD = (1 << 2), + LUSTRE_OPC_CREATE = (1 << 3), + LUSTRE_OPC_ANY = (1 << 4) +}; + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +struct lustre_md { + struct mdt_body *body; + struct lov_stripe_md *lsm; + struct lmv_stripe_md *mea; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif + struct mdt_remote_perm *remote_perm; + struct obd_capa *mds_capa; + struct obd_capa *oss_capa; +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct lookup_intent; + +struct md_ops { + int (*m_getstatus)(struct obd_export *, struct lu_fid *, + struct obd_capa **); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *, + ldlm_iterator_t, void *); + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, int, int, __u32, __u32, cfs_cap_t, + __u64, struct ptlrpc_request **); + int (*m_done_writing)(struct obd_export *, struct md_op_data *, + struct md_open_data *); + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + struct lookup_intent *, struct md_op_data *, + struct lustre_handle *, void *, int, + struct ptlrpc_request **, __u64); + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + void *, int, struct lookup_intent *, int, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, int, const char *, int, + struct ptlrpc_request **); + int (*m_is_subdir)(struct obd_export *, const struct lu_fid *, + const struct lu_fid *, + struct ptlrpc_request **); + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + int , void *, int, struct ptlrpc_request **, + struct md_open_data **mod); + int (*m_sync)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, struct ptlrpc_request **); + int (*m_readpage)(struct obd_export *, struct md_op_data *, + struct page **, struct ptlrpc_request **); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, u64, const char *, + const char *, int, int, int, __u32, + struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, u64, const char *, + const char *, int, int, int, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, int, int, int, int); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_set_open_replay_data)(struct 
obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + + ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, ldlm_type_t, + ldlm_policy_data_t *, ldlm_mode_t, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + ldlm_policy_data_t *, ldlm_mode_t, + ldlm_cancel_flags_t flags, void *opaque); + int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, + renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); + + int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, __u32, + struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *, + struct ldlm_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + /* + * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to + * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a + * wrapper function in include/linux/obd_class.h. + */ +}; + +struct lsm_operations { + void (*lsm_free)(struct lov_stripe_md *); + int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, + struct obd_export *md_exp); + void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, u64 *, + u64 *); + void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, u64 *, + u64 *); + int (*lsm_lmm_verify)(struct lov_mds_md *lmm, int lmm_bytes, + __u16 *stripe_count); + int (*lsm_unpackmd)(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmm); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch (magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + default: + CERROR("Cannot recognize lsm_magic %08x\n", magic); + return NULL; + } +} + +/* Requests for obd_extent_calc() */ +#define OBD_CALC_STRIPE_START 1 +#define OBD_CALC_STRIPE_END 2 + +static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo) +{ + return oinfo->oi_capa; +} + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? 
+ atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, int namelen, int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = strtoul(start, &end, 0); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; +} + +#endif /* __OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_cache.h b/kernel/drivers/staging/lustre/lustre/include/obd_cache.h new file mode 100644 index 000000000..c8249fbb0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_cache.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h b/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h new file mode 100644 index 000000000..3a63462aa --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" + +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
*/ +static inline u32 cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} + +static inline cksum_type_t cksum_type_unpack(u32 o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline cksum_type_t cksum_types_supported_client(void) +{ + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline cksum_type_t cksum_types_supported_server(void) +{ + int base_speed; + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +{ + return cksum_type_unpack(cksum_type_pack(cksum_types)); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. 
*/ +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} + +#endif /* __OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_class.h b/kernel/drivers/staging/lustre/lustre/include/obd_class.h new file mode 100644 index 000000000..34b5fa3f0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_class.h @@ -0,0 +1,1929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + + +#include "obd_support.h" +#include "lustre_import.h" +#include "lustre_net.h" +#include "obd.h" +#include "lustre_lib.h" +#include "lustre/lustre_idl.h" +#include "lprocfs_status.h" + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay + * and resends for avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd + * instead of a specific set. This + * means that we cannot rely on the set + * interpret routine to be called. + * lov_statfs_fini() must thus be called + * by the request interpret routine */ +#define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving + * information from MDT0. 
*/ +#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */ + +/* OBD Device Declarations */ +extern struct obd_device *obd_devs[MAX_OBD_DEVICES]; +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_conn2obd(struct lustre_handle *); +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +extern int lustre_get_jobid(char *jobid); + +struct lu_device_type; + +/* genops.c */ +extern struct list_head obd_types; +struct obd_export *class_conn2export(struct lustre_handle *); +int class_register_type(struct obd_ops *, struct md_ops *, + struct lprocfs_vars *, const char *nm, + struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name); +void class_release_dev(struct obd_device *obd); + +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *typ_name, + struct obd_uuid *grp_uuid); +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device *class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +char *obd_export_nid2str(struct obd_export *exp); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr *kuc_ptr(void *p); +int kuc_ispayload(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); +/* obd_config.c */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +int class_process_config(struct lustre_cfg *lcfg); +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_add_uuid(const char *uuid, __u64 nid); + +/*obdecho*/ +#if defined (CONFIG_PROC_FS) +extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} 
+#endif + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + char *cfg_obdname; + void *cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +enum { + CONFIG_T_CONFIG = 0, + CONFIG_T_SPTLRPC = 1, + CONFIG_T_RECOVER = 2, + CONFIG_T_PARAMS = 3, + CONFIG_T_MAX = 4 +}; + +#define PARAMS_FILENAME "params" +#define LCTL_UPCALL "lctl" + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain; + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + int cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; +}; + +struct lustre_profile *class_get_profile(const char *prof); +void class_del_profile(const char *prof); +void class_del_profiles(void); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while (0) +#define __class_export_del_lock_ref(exp, lock) do {} while (0) + +#endif + +static inline void class_export_rpc_inc(struct obd_export *exp) +{ + atomic_inc(&(exp)->exp_rpc_count); + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", + (exp), atomic_read(&(exp)->exp_rpc_count)); +} + +static inline void class_export_rpc_dec(struct obd_export *exp) +{ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); + atomic_dec(&(exp)->exp_rpc_count); + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", + (exp), atomic_read(&(exp)->exp_rpc_count)); +} + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count 
%d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) | + 0); +} + +struct inode; +struct lu_attr; +struct obdo; +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid); +void la_from_obdo(struct lu_attr *la, struct obdo *dst, u32 valid); +void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid); +void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid); + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid); +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, + unsigned int ia_valid); +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid); +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid); +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo); +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo); + +#define OBT(dev) (dev)->obd_type +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} + +/* ensure obd_setup and !obd_stopping */ +static inline int obd_check_dev_active(struct obd_device *obd) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("Device %d not setup\n", obd->obd_minor); + return -ENODEV; + } + return rc; +} + +#if defined (CONFIG_PROC_FS) +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)(0))->o_iocontrol)) + 
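The OBD_COUNTER_OFFSET() macro above turns an operation's position inside struct obd_ops into an index into the per-device lprocfs stats array: it takes the byte offset of o_<op> relative to o_iocontrol (which the macro treats as the first counted operation) and divides by the size of one function-pointer member. The standalone sketch below is illustrative only and not part of the imported Lustre source; struct toy_ops and TOY_COUNTER_OFFSET() are made-up stand-ins used to show the same offsetof() arithmetic in isolation.

/* Illustration only -- not part of the patch. A reduced ops table and the
 * same offsetof()-based slot calculation used by OBD_COUNTER_OFFSET(). */
#include <stddef.h>
#include <stdio.h>

struct toy_ops {                    /* hypothetical stand-in for struct obd_ops */
	int (*o_iocontrol)(void);   /* first op that gets a stats slot */
	int (*o_get_info)(void);
	int (*o_connect)(void);
	int (*o_statfs)(void);
};

/* Same idea as OBD_COUNTER_OFFSET(op): byte distance from the first counted
 * op, scaled down to an array index by the size of one member. */
#define TOY_COUNTER_OFFSET(op) \
	((offsetof(struct toy_ops, o_ ## op) - \
	  offsetof(struct toy_ops, o_iocontrol)) / \
	 sizeof(((struct toy_ops *)0)->o_iocontrol))

int main(void)
{
	/* Prints 0, 2, 3: each op maps to a distinct counter slot. */
	printf("iocontrol -> %zu\n", TOY_COUNTER_OFFSET(iocontrol));
	printf("connect   -> %zu\n", TOY_COUNTER_OFFSET(connect));
	printf("statfs    -> %zu\n", TOY_COUNTER_OFFSET(statfs));
	return 0;
}

The OBD_COUNTER_INCREMENT()/EXP_COUNTER_INCREMENT() wrappers that follow add this offset to the device's counter base before bumping the lprocfs counter, and the MD_COUNTER_OFFSET()/MD_*_INCREMENT() variants below apply the same scheme to struct md_ops with m_getstatus as the base member.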
+#define OBD_COUNTER_INCREMENT(obdx, op) \ + if ((obdx)->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->obd_stats->ls_num); \ + lprocfs_counter_incr((obdx)->obd_stats, coffset); \ + } + +#define EXP_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \ + if ((export)->exp_nid_stats != NULL && \ + (export)->exp_nid_stats->nid_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_nid_stats->nid_stats, coffset);\ + } + +#define MD_COUNTER_OFFSET(op) \ + ((offsetof(struct md_ops, m_ ## op) - \ + offsetof(struct md_ops, m_getstatus)) \ + / sizeof(((struct md_ops *)(0))->m_getstatus)) + +#define MD_COUNTER_INCREMENT(obdx, op) \ + if ((obd)->md_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->md_stats->ls_num); \ + lprocfs_counter_incr((obdx)->md_stats, coffset); \ + } + +#define EXP_MD_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \ + if ((export)->exp_md_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_md_stats, coffset); \ + } + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define MD_COUNTER_INCREMENT(obd, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC + ,LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +#define OBD_CHECK_MD_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !MDP((obd), op)) { \ + if (err) \ + CERROR("md_" #op ": dev %s/%d no operation\n", \ + obd->obd_name, obd->obd_minor); \ + return err; \ + } \ +} while (0) + +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + return -ENODEV; \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + return -EOPNOTSUPP; \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %s/%d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + return -EOPNOTSUPP; \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("obd_" #op ": dev %d no operation\n", \ + obd->obd_minor); \ + return err; \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + return -ENODEV; \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + return -EOPNOTSUPP; \ + } \ + if 
(!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_minor); \ + return -EOPNOTSUPP; \ + } \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("lop_" #op ": dev %d no operation\n", \ + ctxt->loc_obd->obd_minor); \ + return err; \ + } \ +} while (0) + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, + struct obd_export *exp, __u32 keylen, + void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc; + + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val, + lsm); + return rc; +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, u32 keylen, + void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + return rc; +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
+ */ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + ldt = obd->obd_type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); + rc = OBP(obd, setup)(obd, cfg); + } + return rc; +} + +static inline int obd_precleanup(struct obd_device *obd, + enum obd_cleanup_stage cleanup_stage) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + if (cleanup_stage == OBD_CLEANUP_EXPORTS) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); + } + } + } + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd, cleanup_stage); + return rc; +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + OBD_CHECK_DT_OP(obd, cleanup, 0); + OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); + return rc; +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + /* If we set up but never connected, the + client import will not have been cleaned. */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); +} + +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + + obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + rc = OBP(obd, process_config)(obd, datalen, data); + } + OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; + + return rc; +} + +/* Pack an in-memory MD struct for storage on disk. + * Returns +ve size of packed MD (0 for free), or -ve error. 
+ * + * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL). + * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed. + * If @*disk_tgt == NULL, it will be allocated + */ +static inline int obd_packmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src) +{ + int rc; + + EXP_CHECK_DT_OP(exp, packmd); + EXP_COUNTER_INCREMENT(exp, packmd); + + rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src); + return rc; +} + +static inline int obd_size_diskmd(struct obd_export *exp, + struct lov_stripe_md *mem_src) +{ + return obd_packmd(exp, NULL, mem_src); +} + +static inline int obd_free_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt); + /* + * LU-2590, for caller's convenience, *disk_tgt could be host + * endianness, it needs swab to LE if necessary, while just + * lov_mds_md header needs it for figuring out how much memory + * needs to be freed. + */ + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) || + ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3))) + lustre_swab_lov_mds_md(*disk_tgt); + return obd_packmd(exp, disk_tgt, NULL); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL). + * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed. + * If @*mem_tgt == NULL, it will be allocated + */ +static inline int obd_unpackmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, + int disk_len) +{ + int rc; + + EXP_CHECK_DT_OP(exp, unpackmd); + EXP_COUNTER_INCREMENT(exp, unpackmd); + + rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len); + return rc; +} + +/* helper functions */ +static inline int obd_alloc_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + LASSERT(mem_tgt); + LASSERT(*mem_tgt == NULL); + return obd_unpackmd(exp, mem_tgt, NULL, 0); +} + +static inline int obd_free_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + int rc; + + LASSERT(mem_tgt); + LASSERT(*mem_tgt); + rc = obd_unpackmd(exp, mem_tgt, NULL, 0); + *mem_tgt = NULL; + return rc; +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); + + rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti); + return rc; +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md *ea, + struct obd_trans_info *oti, + struct obd_export *md_exp, void *capa) +{ + int rc; + + EXP_CHECK_DT_OP(exp, destroy); + EXP_COUNTER_INCREMENT(exp, destroy); + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa); + return rc; +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + int rc; + + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); + + rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo); + return rc; +} + +static inline int obd_getattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, getattr_async); + EXP_COUNTER_INCREMENT(exp, getattr_async); + + rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set); + 
return rc; +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + int rc; + + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); + + rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti); + return rc; +} + +/* This performs all the requests set init/wait/destroy actions. */ +static inline int obd_setattr_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + return rc; +} + +/* This adds all the requests into @set if @set != NULL, otherwise + all requests are sent asynchronously without waiting for response. */ +static inline int obd_setattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + return rc; +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); + + rc = OBP(obd, add_conn)(imp, uuid, priority); + return rc; +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); + + rc = OBP(obd, del_conn)(imp, uuid); + return rc; +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + return uuid; +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + return rc; +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? 
d->ocd_connect_flags : 0; /* for post-condition + * check */ + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + return rc; +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); + + rc = OBP(exp->exp_obd, disconnect)(exp); + return rc; +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); + + rc = OBP(obd, fid_init)(obd, exp, type); + return rc; +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); + + rc = OBP(obd, fid_fini)(obd); + return rc; +} + +static inline int obd_fid_alloc(struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); + + rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data); + return rc; +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + return rc; +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + return rc; +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + return rc; +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + return rc; +} + +static inline void obd_getref(struct obd_device *obd) +{ + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } +} + +static inline void obd_putref(struct obd_device *obd) +{ + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + return rc; +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + return 0; +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. 
*/ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + struct obd_device *obd; + + if (exp == NULL || exp->exp_obd == NULL) + return -EINVAL; + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + return rc; +} + +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + return rc; +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd = exp->exp_obd; + + if (obd == NULL) + return -EINVAL; + + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + } + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + return rc; +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local, + struct obd_trans_info *oti, + struct lustre_capa *capa) +{ + int rc; + + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local, oti, capa); + return rc; +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, + 
struct obd_trans_info *oti, int rc) +{ + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, oti, rc); + return rc; +} + +static inline int obd_adjust_kms(struct obd_export *exp, + struct lov_stripe_md *lsm, u64 size, + int shrink) +{ + int rc; + + EXP_CHECK_DT_OP(exp, adjust_kms); + EXP_COUNTER_INCREMENT(exp, adjust_kms); + + rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink); + return rc; +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + int rc; + + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + return rc; +} + +static inline int obd_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + + EXP_CHECK_DT_OP(exp, find_cbdata); + EXP_COUNTER_INCREMENT(exp, find_cbdata); + + rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data); + return rc; +} + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + if (!obd) { + CERROR("NULL device\n"); + return; + } + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); + OBP(obd, import_event)(obd, imp, event); + } +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev, + void *data) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + /* the check for async_recov is a complete hack - I'm hereby + overloading the meaning to also mean "this was called from + mds_postsetup". I know that my mds is able to handle notifies + by this point, and it needs to get them to execute mds_postrecov. */ + if (!obd->obd_set_up && !obd->obd_async_recov) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + return -EINVAL; + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + return -ENOSYS; + } + + OBD_COUNTER_INCREMENT(obd, notify); + rc = OBP(obd, notify)(obd, watched, ev, data); + return rc; +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev, + void *data) +{ + int rc1; + int rc2; + + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc1 = obd_notify(observer->obd_observer, observed, ev, data); + else + rc1 = 0; + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, + onu->onu_owner, NULL); + else + rc2 = 0; + + return rc1 ? 
rc1 : rc2; +} + +static inline int obd_quotacheck(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + + EXP_CHECK_DT_OP(exp, quotacheck); + EXP_COUNTER_INCREMENT(exp, quotacheck); + + rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl); + return rc; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + return rc; +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only supported reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + return -EOPNOTSUPP; + } + if (!obd->obd_set_up || obd->obd_stopping) + return 0; + if (!OBP(obd, health_check)) + return 0; + + rc = OBP(obd, health_check)(env, obd); + return rc; +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + return -EALREADY; + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + return 0; +} + +#if 0 +static inline int obd_register_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb, + obd_pin_extent_cb pin_cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb); + + rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb); + return rc; +} + +static inline int obd_unregister_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb); + + rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb); + return rc; +} + +static inline int obd_register_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb); + + rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb); + return rc; +} + +static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb); + + rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb); + return rc; +} +#endif + +/* metadata helpers */ +static inline int md_getstatus(struct obd_export *exp, + struct lu_fid *fid, struct obd_capa **pc) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getstatus); + EXP_MD_COUNTER_INCREMENT(exp, getstatus); + rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc); + return rc; +} + +static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + return rc; +} + +static inline int md_null_inode(struct obd_export *exp, + const 
struct lu_fid *fid) +{ + int rc; + + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + return rc; +} + +static inline int md_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + int rc; + + EXP_CHECK_MD_OP(exp, find_cbdata); + EXP_MD_COUNTER_INCREMENT(exp, find_cbdata); + rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data); + return rc; +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); + return rc; +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + return rc; +} + +static inline int md_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + int rc; + + EXP_CHECK_MD_OP(exp, done_writing); + EXP_MD_COUNTER_INCREMENT(exp, done_writing); + rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod); + return rc; +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + void *lmm, int lmmsize, + struct ptlrpc_request **req, + __u64 extra_lock_flags) +{ + int rc; + + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + return rc; +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getattr_name); + EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + return rc; +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, void *lmm, + int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, + it, lookup_flags, reqp, cb_blocking, + extra_lock_flags); + return rc; +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + return rc; +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, + int newlen, struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + return rc; +} + +static inline int md_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + struct 
ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, is_subdir); + EXP_MD_COUNTER_INCREMENT(exp, is_subdir); + rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request); + return rc; +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + int rc; + + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, + ea2, ea2len, request, mod); + return rc; +} + +static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, sync); + EXP_MD_COUNTER_INCREMENT(exp, sync); + rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request); + return rc; +} + +static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata, + struct page **pages, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, readpage); + EXP_MD_COUNTER_INCREMENT(exp, readpage); + rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request); + return rc; +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + return rc; +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + return MDP(exp->exp_obd, free_lustre_md)(exp, md); +} + +static inline int md_setxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + return MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + suppgid, request); +} + +static inline int md_getxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + return MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + request); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); +} + +static inline int 
md_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits) +{ + EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); +} + +static inline int md_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + flags, opaque); + return rc; +} + +static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + ldlm_type_t type, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); +} + +static inline int md_init_ea_size(struct obd_export *exp, int easize, + int def_asize, int cookiesize, + int def_cookiesize) +{ + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + return MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize, + cookiesize, def_cookiesize); +} + +static inline int md_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, get_remote_perm); + EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm); + return MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid, + request); +} + +static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa, + renew_capa_cb_t cb) +{ + int rc; + + EXP_CHECK_MD_OP(exp, renew_capa); + EXP_MD_COUNTER_INCREMENT(exp, renew_capa); + rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb); + return rc; +} + +static inline int md_unpack_capa(struct obd_export *exp, + struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + int rc; + + EXP_CHECK_MD_OP(exp, unpack_capa); + EXP_MD_COUNTER_INCREMENT(exp, unpack_capa); + rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc); + return rc; +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + int rc; + + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo); + return rc; +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + return rc; +} + + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +/* support routines */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ +} while (0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while (0) + + +static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid) +{ + /* something here */ +} + +static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa) +{ + /* something here */ +} + +typedef int (*register_lwp_cb)(void *data); + +struct 
lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. + * + * // XXX do not look into _superhack with remaining eye + * // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ + +/* sysctl.c */ +extern void obd_sysctl_init (void); +extern void obd_sysctl_clean (void); + +/* uuid.c */ +typedef __u8 class_uuid_t[16]; +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); +void class_init_uuidlist(void); +void class_exit_uuidlist(void); + +/* class_obd.c */ +extern char obd_jobid_node[]; +extern struct miscdevice obd_psdev; +extern spinlock_t obd_types_lock; + +/* prng.c */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_support.h b/kernel/drivers/staging/lustre/lustre/include/obd_support.h new file mode 100644 index 000000000..2991d2ee7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_support.h @@ -0,0 +1,862 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "linux/lustre_compat25.h" +#include "lprocfs_status.h" + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_MEMORY_PAGES_STAT = 1, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned int obd_sync_filter; +extern unsigned int obd_max_dirty_pages; +extern atomic_t obd_dirty_pages; +extern atomic_t obd_dirty_transit_pages; +extern unsigned int obd_alloc_fail_rate; +extern char obd_jobid_var[]; + +/* lvfs.c */ +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line); + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 +#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. 
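For a concrete sense of scale, the interval macros here can be worked through with the default obd_timeout of OBD_TIMEOUT_DEFAULT (100 seconds); the figures below are illustrative only and shift whenever obd_timeout is tuned:

/*
 *   obd_timeout            = 100 s  (OBD_TIMEOUT_DEFAULT)
 *   OBD_RECOVERY_TIME_SOFT = 100 * 3        = 300 s
 *   OBD_RECOVERY_TIME_HARD = 100 * 9        = 900 s
 *   PING_INTERVAL          = max(100/4, 1)  = 25 s
 *   PING_INTERVAL_SHORT    = min(25, 7)     = 7 s
 *   PING_EVICT_TIMEOUT     = 25 * 6         = 150 s  (defined just below)
 *
 * i.e. an idle client has to miss roughly six ping intervals before the
 * server will even consider evicting it, which is the conservative policy
 * the comment above argues for.
 */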
*/ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b +#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126 +#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f 
+#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_OST_ENOENT 0x217 +#define OBD_FAIL_OST_QUOTACHECK_NET 0x218 +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 + +#define OBD_FAIL_OBD_PING_NET 0x600 +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +#define 
OBD_FAIL_OBD_LOGD_NET 0x602 +#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +#define OBD_FAIL_TGT_LAST_REPLAY 0x710 +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_GENERAL_ALLOC 0xC00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 
+#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c + +/* UPDATE */ +#define OBD_FAIL_UPDATE_OBJ_NET 0x1700 +#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +extern atomic_t libcfs_kmemory; + +extern void obd_update_maxusage(void); + +#if defined (CONFIG_PROC_FS) +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) +#define obd_pages_add(order) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sub(order) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern __u64 obd_memory_max(void); +extern __u64 obd_pages_max(void); + +#else + +extern __u64 obd_alloc; +extern __u64 obd_pages; + +extern __u64 obd_max_alloc; +extern __u64 obd_max_pages; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +static inline void obd_pages_add(int order) +{ + obd_pages += 1<< order; + if (obd_pages > obd_max_pages) + obd_max_pages = obd_pages; +} + +static inline void obd_pages_sub(int order) +{ + obd_pages -= 1<< order; +} + +#define obd_memory_sum() (obd_alloc) +#define obd_pages_sum() (obd_pages) + +#define obd_memory_max() (obd_max_alloc) +#define obd_pages_max() (obd_max_pages) + +#endif + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC) + +#define OBD_ALLOC_FAIL_BITS 24 +#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1) +#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100) + +#if defined(LUSTRE_UTILS) /* this version is for utils only */ +#define 
__OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags) : \ + kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \ + (int)(size), __FILE__, __LINE__); \ + } else { \ + memset(ptr, 0, size); \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \ + (int)(size), ptr); \ + } \ +} while (0) + +#else /* this version is for the kernel and liblustre */ +#define OBD_FREE_RTN0(ptr) \ +({ \ + kfree(ptr); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags | __GFP_ZERO) : \ + kmalloc_node(size, flags | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "km", size, \ + __FILE__, __LINE__) || \ + OBD_FREE_RTN0(ptr)))){ \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + } \ +} while (0) +#endif + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof(*(ptr))) + +# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + vzalloc(size) : \ + vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while (0) + +# define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size) +# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) + + +/* Allocations above this size are considered too big and could not be done + * atomically. + * + * Be very careful when changing this value, especially when decreasing it, + * since vmalloc in Linux doesn't perform well on multi-cores system, calling + * vmalloc in critical path would hurt performance badly. See LU-66. 
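To put a number on that threshold: with 4 KB pages, OBD_ALLOC_BIG (defined just below) works out to 16 KB, and OBD_ALLOC_LARGE()/OBD_FREE_LARGE() pick kmalloc- or vmalloc-backed allocation on either side of it. A minimal sketch of a hypothetical caller (the function and variable names are made up for illustration):

static int example_alloc_fid_array(size_t count)
{
	struct lu_fid *fids;

	/* Routed to vzalloc() once the total size exceeds OBD_ALLOC_BIG. */
	OBD_ALLOC_LARGE(fids, count * sizeof(*fids));
	if (fids == NULL)
		return -ENOMEM;

	/* ... fill and use fids ... */

	OBD_FREE_LARGE(fids, count * sizeof(*fids));
	return 0;
}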
+ */ +#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VMALLOC(ptr, size); \ + else \ + OBD_ALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ + else \ + OBD_CPT_ALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VFREE(ptr, size); \ + else \ + OBD_FREE(ptr, size); \ +} while (0) + + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while (0) + + +#define OBD_VFREE(ptr, size) \ + do { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + vfree(ptr); \ + POISON_PTR(ptr); \ + } while (0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? 
\ + kmem_cache_alloc(slab, type | __GFP_ZERO) : \ + kmem_cache_alloc_node(slab, type | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "slab-", size, \ + __FILE__, __LINE__) || \ + OBD_SLAB_FREE_RTN0(ptr, slab)))) { \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ + } \ +} while (0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof(*(ptr))) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof(*(ptr))) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof(*(ptr))) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof(*(ptr)), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof(*(ptr)), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof(*(ptr))) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +/* Wrapper for contiguous page frame allocation */ +#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \ +do { \ + (ptr) = (cptab) == NULL ? \ + alloc_page(gfp_mask) : \ + alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\ + if (unlikely((ptr) == NULL)) { \ + CERROR("alloc_pages of '" #ptr "' %d page(s) / %llu bytes "\ + "failed\n", (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT)); \ + CERROR("%llu total bytes and %llu total pages " \ + "(%llu bytes) allocated by Lustre, " \ + "%d total bytes by LNET\n", \ + obd_memory_sum(), \ + obd_pages_sum() << PAGE_CACHE_SHIFT, \ + obd_pages_sum(), \ + atomic_read(&libcfs_kmemory)); \ + } else { \ + obd_pages_add(0); \ + CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \ + "%llu bytes at %p.\n", \ + (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \ + } \ +} while (0) + +#define OBD_PAGE_ALLOC(ptr, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask) +#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) + +#define OBD_PAGE_FREE(ptr) \ +do { \ + LASSERT(ptr); \ + obd_pages_sub(0); \ + CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / %llu bytes " \ + "at %p.\n", \ + (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \ + ptr); \ + __free_page(ptr); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c b/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c new file mode 100644 index 000000000..b9f2bb66d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c @@ -0,0 +1,269 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/obd.h" + +#include "../include/lustre_dlm.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_mdc.h" +#include +#include + +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "../llite/llite_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwriten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct ccc_object *vob = cl_inode2ccc(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr; + struct cl_inode_info *lli = cl_i2info(inode); + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock *lock; + int result; + + result = 0; + if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) { + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + if (lli->lli_has_smd) { + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_ASYNC flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. 
*/ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_PHANTOM; + descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_AGL; + cio->cui_glimpse = 1; + /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, can be enqueued out-of-order. + * + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + lock = cl_lock_request(env, io, descr, "glimpse", + current); + cio->cui_glimpse = 0; + + if (lock == NULL) + return 0; + + if (IS_ERR(lock)) + return PTR_ERR(lock); + + LASSERT(agl == 0); + result = cl_wait(env, lock); + if (result == 0) { + cl_merge_lvb(env, inode); + if (cl_isize_read(inode) > 0 && + inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + cl_unuse(env, lock); + } + cl_lock_release(env, lock, "glimpse", current); + } else { + CDEBUG(D_DLMTRACE, "No objects for inode\n"); + cl_merge_lvb(env, inode); + } + } + + return result; +} + +static int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, int *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(cl_inode_mode(inode))) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = ccc_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = 1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + int result; + int refcheck; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { +again: + io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. 
+ */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + } + return result; +} + +int cl_local_size(struct inode *inode) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct ccc_thread_info *cti; + struct cl_object *clob; + struct cl_lock_descr *descr; + struct cl_lock *lock; + int result; + int refcheck; + + if (!cl_i2info(inode)->lli_has_smd) + return 0; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + return result; + + clob = io->ci_obj; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result > 0) + result = io->ci_result; + else if (result == 0) { + cti = ccc_env_info(env); + descr = &cti->cti_descr; + + *descr = whole_file; + descr->cld_obj = clob; + lock = cl_lock_peek(env, io, descr, "localsize", current); + if (lock != NULL) { + cl_merge_lvb(env, inode); + cl_unuse(env, lock); + cl_lock_release(env, lock, "localsize", current); + result = 0; + } else + result = -ENODATA; + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return result; +} diff --git a/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c new file mode 100644 index 000000000..ab6cb4193 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c @@ -0,0 +1,1287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). 
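The glimpse entry points above (cl_glimpse_size0(), cl_glimpse_lock() and cl_local_size()) are normally reached through thin wrappers keyed on the agl flag; cl_glimpse_size() and cl_agl() in the client headers have this shape. The names below are illustrative stand-ins:

static inline int example_glimpse_size(struct inode *inode)
{
	/* Synchronous glimpse: block until the merged size is in the inode. */
	return cl_glimpse_size0(inode, 0);
}

static inline int example_agl(struct inode *inode)
{
	/* Asynchronous glimpse lock (AGL), used e.g. by statahead to
	 * prefetch file sizes without blocking the caller. */
	return cl_glimpse_size0(inode, 1);
}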
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../../include/linux/libcfs/libcfs.h" +# include +# include +# include +# include +# include +# include +# include + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_mdc.h" +#include "../include/cl_object.h" + +#include "../include/lclient.h" + +#include "../llite/llite_internal.h" + +static const struct cl_req_operations ccc_req_ops; + +/* + * ccc_ prefix stands for "Common Client Code". + */ + +static struct kmem_cache *ccc_lock_kmem; +static struct kmem_cache *ccc_object_kmem; +static struct kmem_cache *ccc_thread_kmem; +static struct kmem_cache *ccc_session_kmem; +static struct kmem_cache *ccc_req_kmem; + +static struct lu_kmem_descr ccc_caches[] = { + { + .ckd_cache = &ccc_lock_kmem, + .ckd_name = "ccc_lock_kmem", + .ckd_size = sizeof(struct ccc_lock) + }, + { + .ckd_cache = &ccc_object_kmem, + .ckd_name = "ccc_object_kmem", + .ckd_size = sizeof(struct ccc_object) + }, + { + .ckd_cache = &ccc_thread_kmem, + .ckd_name = "ccc_thread_kmem", + .ckd_size = sizeof(struct ccc_thread_info), + }, + { + .ckd_cache = &ccc_session_kmem, + .ckd_name = "ccc_session_kmem", + .ckd_size = sizeof(struct ccc_session) + }, + { + .ckd_cache = &ccc_req_kmem, + .ckd_name = "ccc_req_kmem", + .ckd_size = sizeof(struct ccc_req) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key) +{ + struct ccc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +void ccc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, ccc_thread_kmem); +} + +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_session *session = data; + + OBD_SLAB_FREE_PTR(session, ccc_session_kmem); +} + +struct lu_context_key ccc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ccc_key_init, + .lct_fini = ccc_key_fini +}; + +struct lu_context_key ccc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = ccc_session_key_init, + .lct_fini = ccc_session_key_fini +}; + + +/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). 
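The two lu_context_key objects registered above are how later code pulls per-thread (struct ccc_thread_info) and per-session (struct ccc_session) scratch state out of a struct lu_env; the ccc_env_info()/ccc_env_io() accessors used elsewhere in this file follow roughly this pattern (the example name is hypothetical):

static inline struct ccc_thread_info *example_env_info(const struct lu_env *env)
{
	struct ccc_thread_info *info;

	/* Returns the value ccc_key_init() allocated for this context. */
	info = lu_context_key_get(&env->le_ctx, &ccc_key);
	LASSERT(info != NULL);
	return info;
}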
*/ +/* LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key); */ + +int ccc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct ccc_device *vdv; + int rc; + + vdv = lu2ccc_dev(d); + vdv->cdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + return rc; +} + +struct lu_device *ccc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2ccc_dev(d)->cdv_next); +} + +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops) +{ + struct ccc_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + return ERR_PTR(-ENOMEM); + + lud = &vdv->cdv_cl.cd_lu_dev; + cl_device_init(&vdv->cdv_cl, t); + ccc2lu_dev(vdv)->ld_ops = luops; + vdv->cdv_cl.cd_ops = clops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->cdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + ccc_device_free(env, lud); + lud = ERR_PTR(rc); + } + return lud; +} + +struct lu_device *ccc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct ccc_device *vdv = lu2ccc_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->cdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct ccc_req *vrq; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, GFP_NOFS); + if (vrq != NULL) { + cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** + * An `emergency' environment used by ccc_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by ccc_inode_fini_guard + * mutex. + */ +static struct lu_env *ccc_inode_fini_env; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
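A sketch of the fallback this emergency environment and its guard mutex (defined just below) make possible; the function name is hypothetical, and the real consumer is the inode teardown path mentioned above, which only borrows the preallocated environment when cl_env_get() fails under memory pressure:

static void example_fini_under_pressure(struct inode *inode)
{
	struct lu_env *env;
	int refcheck;
	bool emergency = false;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env)) {
		/* No memory for a fresh environment: borrow the
		 * preallocated one, serialized by the guard mutex. */
		mutex_lock(&ccc_inode_fini_guard);
		LASSERT(ccc_inode_fini_env != NULL);
		env = ccc_inode_fini_env;
		emergency = true;
	}

	/* ... tear down the inode's cl_object state using env ... */

	if (emergency)
		mutex_unlock(&ccc_inode_fini_guard);
	else
		cl_env_put(env, &refcheck);
}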
+ */ +static DEFINE_MUTEX(ccc_inode_fini_guard); +static int dummy_refcheck; + +int ccc_global_init(struct lu_device_type *device_type) +{ + int result; + + result = lu_kmem_init(ccc_caches); + if (result) + return result; + + result = lu_device_type_init(device_type); + if (result) + goto out_kmem; + + ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (IS_ERR(ccc_inode_fini_env)) { + result = PTR_ERR(ccc_inode_fini_env); + goto out_device; + } + + ccc_inode_fini_env->le_ctx.lc_cookie = 0x4; + return 0; +out_device: + lu_device_type_fini(device_type); +out_kmem: + lu_kmem_fini(ccc_caches); + return result; +} + +void ccc_global_fini(struct lu_device_type *device_type) +{ + if (ccc_inode_fini_env != NULL) { + cl_env_put(ccc_inode_fini_env, &dummy_refcheck); + ccc_inode_fini_env = NULL; + } + lu_device_type_fini(device_type); + lu_kmem_fini(ccc_caches); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops) +{ + struct ccc_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = ccc2lu(vob); + hdr = &vob->cob_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->cob_cl.co_ops = clops; + obj->lo_ops = luops; + } else + obj = NULL; + return obj; +} + +int ccc_object_init0(const struct lu_env *env, + struct ccc_object *vob, + const struct cl_object_conf *conf) +{ + vob->cob_inode = conf->coc_inode; + vob->cob_transient_pages = 0; + cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page)); + return 0; +} + +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct ccc_device *dev = lu2ccc_dev(obj->lo_dev); + struct ccc_object *vob = lu2ccc(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->cdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + INIT_LIST_HEAD(&vob->cob_pending_list); + lu_object_add(obj, below); + result = ccc_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + return result; +} + +void ccc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct ccc_object *vob = lu2ccc(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, ccc_object_kmem); +} + +int ccc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused, + const struct cl_lock_operations *lkops) +{ + struct ccc_lock *clk; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, GFP_NOFS); + if (clk != NULL) { + cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + return 0; +} + +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = ccc_object_inode(obj); + + lvb->lvb_mtime = 
cl_inode_mtime(inode); + lvb->lvb_atime = cl_inode_atime(inode); + lvb->lvb_ctime = cl_inode_ctime(inode); + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. + */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + return 0; +} + + + +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + /* TODO: destroy all pages attached to this object. */ + return 0; +} + +static void ccc_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_isize_lock(inode); + cl_object_attr_lock(obj); +} + +static void ccc_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_object_attr_unlock(obj); + cl_isize_unlock(inode); +} + +/***************************************************************************** + * + * Page operations. + * + */ + +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2vm_page(slice); +} + +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr; + struct cl_page *page = slice->cpl_page; + + int result; + + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + io->ci_type == CIT_FAULT) { + if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + result = -EBUSY; + else { + desc->cld_start = page->cp_index; + desc->cld_end = page->cp_index; + desc->cld_obj = page->cp_obj; + desc->cld_mode = CLM_READ; + result = cl_queue_match(&io->ci_lockset.cls_done, + desc) ? -EBUSY : 0; + } + } else + result = 0; + return result; +} + +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + +void ccc_transient_page_verify(const struct cl_page *page) +{ +} + +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, + int nonblock) +{ + ccc_transient_page_verify(slice->cpl_page); + return 0; +} + +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + ccc_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* transient page should always be sent. */ + return 0; +} + +/***************************************************************************** + * + * Lock operations. 
+ * + */ + +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); +} + +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct ccc_lock *clk = cl2ccc_lock(slice); + + OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem); +} + +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +/** + * Implementation of cl_lock_operations::clo_fits_into() methods for ccc + * layer. This function is executed every time io finds an existing lock in + * the lock cache while creating new lock. This function has to decide whether + * cached lock "fits" into io. + * + * \param slice lock to be checked + * \param io IO that wants a lock. + * + * \see lov_lock_fits_into(). + */ +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock *lock = slice->cls_lock; + const struct cl_lock_descr *descr = &lock->cll_descr; + const struct ccc_io *cio = ccc_env_io(env); + int result; + + /* + * Work around DLM peculiarity: it assumes that glimpse + * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns reads lock + * when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set. Make + * sure that glimpse doesn't get CLM_WRITE top-lock, so that it + * doesn't enqueue CLM_WRITE sub-locks. + */ + if (cio->cui_glimpse) + result = descr->cld_mode != CLM_WRITE; + + /* + * Also, don't match incomplete write locks for read, otherwise read + * would enqueue missing sub-locks in the write mode. + */ + else if (need->cld_mode != descr->cld_mode) + result = lock->cll_state >= CLS_ENQUEUED; + else + result = 1; + return result; +} + +/** + * Implements cl_lock_operations::clo_state() method for ccc layer, invoked + * whenever lock state changes. Transfers object attributes, that might be + * updated as a result of lock acquiring into inode. + */ +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct cl_lock *lock = slice->cls_lock; + + /* + * Refresh inode attributes when the lock is moving into CLS_HELD + * state, and only when this is a result of real enqueue, rather than + * of finding lock in the cache. + */ + if (state == CLS_HELD && lock->cll_state < CLS_HELD) { + struct cl_object *obj; + struct inode *inode; + + obj = slice->cls_obj; + inode = ccc_object_inode(obj); + + /* vmtruncate() sets the i_size + * under both a DLM lock and the + * ll_inode_size_lock(). If we don't get the + * ll_inode_size_lock() here we can match the DLM lock and + * reset i_size. generic_file_write can then trust the + * stale i_size when doing appending writes and effectively + * cancel the result of the truncate. 
Getting the + * ll_inode_size_lock() after the enqueue maintains the DLM + * -> ll_inode_size_lock() acquiring order. */ + if (lock->cll_descr.cld_start == 0 && + lock->cll_descr.cld_end == CL_PAGE_EOF) + cl_merge_lvb(env, inode); + } +} + +/***************************************************************************** + * + * io operations. + * + */ + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); +} + +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *descr = &cio->cui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&cio->cui_link, 0, sizeof(cio->cui_link)); + + if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = cio->cui_fd->fd_grouplock.cg_gid; + } else { + descr->cld_mode = mode; + } + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &cio->cui_link); + return 0; +} + +void ccc_io_update_iov(const struct lu_env *env, + struct ccc_io *cio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || cio->cui_iter == NULL) + return; + + iov_iter_truncate(cio->cui_iter, size); +} + +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return ccc_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + CLOBINVRNT(env, ios->cis_io->ci_obj, + ccc_object_invariant(ios->cis_io->ci_obj)); +} + +void ccc_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + if (!cl_is_normalio(env, io)) + return; + + iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count -= nob); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: cl_isize_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
+ */ +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + struct inode *inode = ccc_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + ccc_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + ccc_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = cl_isize_read(inode); + loff_t cur_index = start >> PAGE_CACHE_SHIFT; + loff_t size_index = (size - 1) >> + PAGE_CACHE_SHIFT; + + if ((size == 0 && cur_index != 0) || + size_index < cur_index) + *exceed = 1; + } + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (cl_isize_read(inode) < kms) { + cl_isize_write_nolock(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)cl_isize_read(inode)); + + } + } + } + ccc_object_size_unlock(obj); + return result; +} + +/***************************************************************************** + * + * Transfer operations. + * + */ + +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct ccc_req *vrq; + + if (ioret > 0) + cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret); + + vrq = cl2ccc_req(slice); + OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for ccc + * layer. ccc is responsible for + * + * - o_[mac]time + * + * - o_mode + * + * - o_parent_seq + * + * - o_[ug]id + * + * - o_parent_oid + * + * - o_parent_ver + * + * - o_ioepoch, + * + * and capability. 
+ */ +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct inode *inode; + struct obdo *oa; + u32 valid_flags; + + oa = attr->cra_oa; + inode = ccc_object_inode(obj); + valid_flags = OBD_MD_FLTYPE; + + if ((flags & OBD_MD_FLOSSCAPA) != 0) { + LASSERT(attr->cra_capa == NULL); + attr->cra_capa = cl_capa_lookup(inode, + slice->crs_req->crq_type); + } + + if (slice->crs_req->crq_type == CRT_WRITE) { + if (flags & OBD_MD_FLEPOCH) { + oa->o_valid |= OBD_MD_FLEPOCH; + oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLUID | OBD_MD_FLGID; + } + } + obdo_from_inode(oa, inode, valid_flags & flags); + obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid); + memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid, + JOBSTATS_JOBID_SIZE); +} + +static const struct cl_req_operations ccc_req_ops = { + .cro_attr_set = ccc_req_attr_set, + .cro_completion = ccc_req_completion +}; + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa) +{ + struct lu_env *env; + struct cl_io *io; + int result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + + io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); + io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); + io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_capa = capa; + +again: + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct ccc_io *cio = ccc_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* populate the file descriptor for ftruncate to honor + * group lock - see LU-787 */ + cio->cui_fd = cl_iattr2fd(inode, attr); + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + /* HSM import case: file is released, cannot be restored + * no need to fail except if restore registration failed + * with -ENODATA */ + if (result == -ENODATA && io->ci_restore_needed && + io->ci_result != -ENODATA) + result = 0; + cl_env_put(env, &refcheck); + return result; +} + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +struct lu_device *ccc2lu_dev(struct ccc_device *vdv) +{ + return &vdv->cdv_cl.cd_lu_dev; +} + +struct ccc_device *lu2ccc_dev(const struct lu_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev); +} + +struct ccc_device *cl2ccc_dev(const struct cl_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl); +} + +struct lu_object *ccc2lu(struct ccc_object *vob) +{ + return &vob->cob_cl.co_lu; +} + +struct ccc_object *lu2ccc(const struct lu_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl.co_lu); +} + +struct ccc_object *cl2ccc(const struct cl_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl); +} + +struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct ccc_lock, clk_cl); +} + +struct ccc_io *cl2ccc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct ccc_io *cio; + + cio = container_of(slice, struct ccc_io, cui_cl); + LASSERT(cio == ccc_env_io(env)); + return cio; +} + +struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct ccc_req, crq_cl); +} + +struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2ccc_page(slice)->cpg_page; +} + +/***************************************************************************** + * + * Accessors. + * + */ +int ccc_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + struct cl_inode_info *lli = cl_i2info(inode); + + return (S_ISREG(cl_inode_mode(inode)) || + /* i_mode of unlinked inode is zeroed. */ + cl_inode_mode(inode) == 0) && lli->lli_clob == obj; +} + +struct inode *ccc_object_inode(const struct cl_object *obj) +{ + return cl2ccc(obj)->cob_inode; +} + +/** + * Returns a pointer to cl_page associated with \a vmpage, without acquiring + * additional reference to the resulting page. This is an unsafe version of + * cl_vmpage_page() that can only be used under vmpage lock. + */ +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage) +{ + KLASSERT(PageLocked(vmpage)); + return (struct cl_page *)vmpage->private; +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct cl_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_md = md + } + }; + int result = 0; + int refcheck; + + LASSERT(md->body->valid & OBD_MD_FLID); + LASSERT(S_ISREG(cl_inode_mode(inode))); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = cl_i2sbi(inode)->ll_site; + lli = cl_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. */ + LASSERT(inode->i_state & I_NEW); + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
+ */ + lli->lli_clob = clob; + lli->lli_has_smd = lsm_has_objects(md->lsm); + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else + result = PTR_ERR(clob); + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + } + + cl_env_put(env, &refcheck); + + if (result != 0) + CERROR("Failure to initialize cl object "DFID": %d\n", + PFID(fid), result); + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + wait_queue_t waiter; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + struct lu_site_bkt_data *bkt; + + bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&header->loh_ref) == 1) + break; + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int refcheck; + int emergency; + + if (clob != NULL) { + void *cookie; + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&ccc_inode_fini_guard); + LASSERT(ccc_inode_fini_env != NULL); + cl_env_implant(ccc_inode_fini_env, &refcheck); + env = ccc_inode_fini_env; + } + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) { + cl_env_unplant(ccc_inode_fini_env, &refcheck); + mutex_unlock(&ccc_inode_fini_guard); + } else + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + } +} + +/** + * return IF_* type for given lu_dirent entry. + * IF_* flag shld be converted to particular OS file type in + * platform llite module. + */ +__u16 ll_dirent_type_get(struct lu_dirent *ent) +{ + __u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + + len = le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = IFTODT(le16_to_cpu(lt->lt_type)); + } + return type; +} + +/** + * build inode number from passed @fid */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + return fid_flatten32(fid); + else + return fid_flatten(fid); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. 
*/ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + __u32 gen; + + if (fid_is_igif(fid)) { + gen = lu_igif_gen(fid); + return gen; + } + + gen = fid_flatten(fid) >> 32; + return gen; +} + +/* lsm is unreliable after hsm implementation as layout can be changed at + * any time. This is only to support old, non-clio-ized interfaces. It will + * cause deadlock if clio operations are called with this extra layout refcount + * because in case the layout changed during the IO, ll_layout_refresh() will + * have to wait for the refcount to become zero to destroy the older layout. + * + * Notice that the lsm returned by this function may not be valid unless called + * inside layout lock - MDS_INODELOCK_LAYOUT. */ +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode) +{ + return lov_lsm_get(cl_i2info(inode)->lli_clob); +} + +inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm) +{ + lov_lsm_put(cl_i2info(inode)->lli_clob, lsm); +} diff --git a/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c new file mode 100644 index 000000000..01bf894d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + */ +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/cl_object.h" +#include "../include/lclient.h" + +#include "../include/lustre_lite.h" + + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. 
*/ +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 }; + __u32 valsize = sizeof(struct lov_desc); + int rc, easize, def_easize, cookiesize; + struct lov_desc desc; + __u16 stripes, def_stripes; + + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC, + &valsize, &desc, NULL); + if (rc) + return rc; + + stripes = min_t(__u32, desc.ld_tgt_count, LOV_MAX_STRIPE_COUNT); + lsm.lsm_stripe_count = stripes; + easize = obd_size_diskmd(dt_exp, &lsm); + + def_stripes = min_t(__u32, desc.ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + lsm.lsm_stripe_count = def_stripes; + def_easize = obd_size_diskmd(dt_exp, &lsm); + + cookiesize = stripes * sizeof(struct llog_cookie); + + /* default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. */ + CDEBUG(D_HA, + "updating def/max_easize: %d/%d def/max_cookiesize: 0/%d\n", + def_easize, easize, cookiesize); + + rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize, 0); + return rc; +} + +/** + * This function is used as an upcall-callback hooked by liblustre and llite + * clients into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See llu_fsswop_mount() and + * lustre_common_fill_super(). + */ +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + result = -EINVAL; + } + return result; +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + __u32 enqflags; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = ccc_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc) { + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + cl_env_put(env, &refcheck); + return rc; + } + + descr = &ccc_env_info(env)->cti_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? 
CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current); + if (IS_ERR(lock)) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return PTR_ERR(lock); + } + + cg->cg_env = cl_env_get(&refcheck); + cg->cg_io = io; + cg->cg_lock = lock; + cg->cg_gid = gid; + LASSERT(cg->cg_env == env); + + cl_env_unplant(env, &refcheck); + return 0; +} + +void cl_put_grouplock(struct ccc_grouplock *cg) +{ + struct lu_env *env = cg->cg_env; + struct cl_io *io = cg->cg_io; + struct cl_lock *lock = cg->cg_lock; + int refcheck; + + LASSERT(cg->cg_env); + LASSERT(cg->cg_gid); + + cl_env_implant(env, &refcheck); + cl_env_put(env, &refcheck); + + cl_unuse(env, lock); + cl_lock_release(env, lock, GROUPLOCK_SCOPE, current); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c new file mode 100644 index 000000000..eab2bd602 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c @@ -0,0 +1,751 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/interval_tree.h" + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +static inline int node_equal(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? 
x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + if (!node) + return NULL; + while (node->in_left) + node = node->in_left; + return node; +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + if (!node) + return NULL; + while (node->in_right) + node = node->in_right; + return node; +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + if (!node) + return NULL; + if (node->in_right) + return interval_first(node->in_right); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + return node->in_parent; +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + if (!node) + return NULL; + + if (node->in_left) + return interval_last(node->in_left); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + return node->in_parent; +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + return walk; +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. 
*/ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. 
+ */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + return parent; + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + return NULL; +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + + o_left = tmp->in_left; + if (o_left) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + 
node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + + o_right = tmp->in_right; + if (o_right) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? : parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. 
+ * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->end < interval_low(parent), or + * may_overlap(parent). In the former case nothing in + * parent's right subtree can overlap ext, and the + * may_overlap(parent) check after the loop stops the + * search. */ + node = parent->in_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand too low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easier to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only care about the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node.
*/ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c b/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c new file mode 100644 index 000000000..cd8ab40e3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/lustre_dlm.h" +#include "../include/lustre_lib.h" + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on lock->l_resource = new_res + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + if ((lock->l_flags & LDLM_FL_NS_SRV) == 0) + spin_lock(&lock->l_lock); + + lock_res(lock->l_resource); + + lock->l_flags |= LDLM_FL_RES_LOCKED; + return lock->l_resource; +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + lock->l_flags &= ~LDLM_FL_RES_LOCKED; + + unlock_res(lock->l_resource); + if ((lock->l_flags & LDLM_FL_NS_SRV) == 0) + spin_unlock(&lock->l_lock); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c new file mode 100644 index 000000000..fd9b05936 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "ldlm_internal.h" + + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value. + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *tmp; + struct ldlm_lock *lck; + __u64 kms = 0; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + lock->l_flags |= LDLM_FL_KMS_IGNORE; + + list_for_each(tmp, &res->lr_granted) { + lck = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lck->l_flags & LDLM_FL_KMS_IGNORE) + continue; + + if (lck->l_policy_data.l_extent.end >= old_kms) + return old_kms; + + /* This extent _has_ to be smaller than old_kms (checked above) + * so kms can only ever be smaller or the same as old_kms. 
*/ + if (lck->l_policy_data.l_extent.end + 1 > kms) + kms = lck->l_policy_data.l_extent.end + 1; + } + LASSERTF(kms <= old_kms, "kms %llu old_kms %llu\n", kms, old_kms); + + return kms; +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + return NULL; + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + return node; +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int lock_mode_to_index(ldlm_mode_t mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(IS_PO2(mode)); + for (index = -1; mode; index++) + mode >>= 1; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +/** Add newly granted lock into interval tree for the resource. */ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx; + + LASSERT(lock->l_granted_mode == lock->l_req_mode); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + interval_set(&node->li_node, extent->start, extent->end); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp; + + tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); +} + +/** Remove cancelled lock from resource interval tree. 
*/ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c new file mode 100644 index 000000000..a4c252feb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,859 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. 
+ * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. + */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + req->l_policy_data.l_flock.blocking_refs = 0; + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags) +{ + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + need call the nolock version of ldlm_lock_decref_internal*/ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); +} + +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). 
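+ *
+ * The walk below follows the blocking_owner/blocking_export chain that
+ * ldlm_flock_blocking_link() records for each blocked lock:
+ *
+ *   req(owner A) waits on owner B, B's blocked lock waits on C, ...
+ *
+ * and reports a deadlock (returns 1) if the chain leads back to the
+ * owner and export of \a req itself.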
+ */ +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + if (bl_exp->exp_flock_hash != NULL) + lock = cfs_hash_lookup(bl_exp->exp_flock_hash, + &bl_owner); + if (lock == NULL) + break; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_owner == req_owner && bl_exp == req_exp) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR( + "deadlock found, but client doesn't support flock canceliation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT((lock->l_flags & LDLM_FL_AST_SENT) == 0); + lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK; + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + * + * It is also responsible for splitting a lock if a portion of the lock + * is released. + * + * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue): + * - blocking ASTs have already been sent + * + * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue): + * - blocking ASTs have not been sent yet, so list of conflicting locks + * would be collected and ASTs sent. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq, + ldlm_error_t *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + ldlm_mode_t mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; + + CDEBUG(D_DLMTRACE, + "flags %#llx owner %llu pid %u mode %u start %llu end %llu\n", + *flags, new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. 
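+ * ldlm_flock_blocking_ast() below only takes the lock off the export's
+ * flock hash; nothing is sent back to the client.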
*/ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } else { + int reprocess_failed = 0; + + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (!first_enq) { + reprocess_failed = 1; + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock(req, + work_list); + return LDLM_ITER_CONTINUE; + } + continue; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + return LDLM_ITER_STOP; + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + return LDLM_ITER_STOP; + } + + /* add lock to blocking list before deadlock + * check to prevent race */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + return LDLM_ITER_STOP; + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + return LDLM_ITER_STOP; + } + if (reprocess_failed) + return LDLM_ITER_CONTINUE; + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + return LDLM_ITER_STOP; + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. */ + ldlm_flock_blocking_unlink(req); + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. */ + + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. 
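+ * (e.g. "end + 1" would wrap to 0 for an existing lock ending at
+ * OBD_OBJECT_EOF, and "start - 1" would wrap for one starting at offset
+ * 0, hence the explicit checks against those two values below.)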
*/ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
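+ * That is in fact what the code below does: the resource lock is dropped
+ * around ldlm_lock_create() and, once the new lock is allocated, control
+ * jumps back to the "reprocess" label.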
*/ + if (!new2) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (!new2) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = -ENOLCK; + return LDLM_ITER_STOP; + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, + new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. */ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + return LDLM_ITER_CONTINUE; +} + +struct ldlm_flock_wait_data { + struct ldlm_lock *fwd_lock; + int fwd_generation; +}; + +static void +ldlm_flock_interrupted_wait(void *data) +{ + struct ldlm_lock *lock; + + lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being put on LRU list */ + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); +} + +/** + * Flock completion callback function. 
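+ * Runs on the client when a flock enqueue completes: either the lock was
+ * granted directly in the reply, or the caller sleeps here until a
+ * completion AST arrives, the wait is interrupted, or the import is
+ * invalidated.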
+ * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct ldlm_flock_wait_data fwd; + struct l_wait_info lwi; + ldlm_error_t err; + int rc = 0; + + CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n", + flags, data, getlk); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == + (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { + if (lock->l_req_mode == lock->l_granted_mode && + lock->l_granted_mode != LCK_NL && + NULL == data) + ldlm_lock_decref_internal(lock, lock->l_req_mode); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + return 0; + } + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + return 0; + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping"); + fwd.fwd_lock = lock; + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, there is no import */ + if (NULL != obd) + imp = obd->u.cli.cl_import; + + if (NULL != imp) { + spin_lock(&imp->imp_lock); + fwd.fwd_generation = imp->imp_generation; + spin_unlock(&imp->imp_lock); + } + + lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi); + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + return 0; + } + + if (lock->l_flags & LDLM_FL_FAILED) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed"); + return -EIO; + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + lock_res_and_lock(lock); + + /* take lock off the deadlock detection hash list. */ + ldlm_flock_blocking_unlink(lock); + + /* ldlm_lock_enqueue() has already placed lock on the granted list. 
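+ * Take it back off: for an F_GETLK request the lock is destroyed just
+ * below, and in the normal case ldlm_process_flock_lock() merges or
+ * splits it against this process's other locks and puts the result back
+ * on the granted list.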
*/ + list_del_init(&lock->l_res_link); + + if (lock->l_flags & LDLM_FL_FLOCK_DEADLOCK) { + LDLM_DEBUG(lock, "client-side enqueue deadlock received"); + rc = -EDEADLK; + } else if (flags & LDLM_FL_TEST_LOCK) { + /* fcntl(F_GETLK) request */ + /* The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount.*/ + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + getlk->fl_type = F_RDLCK; + break; + case LCK_PW: + getlk->fl_type = F_WRLCK; + break; + default: + getlk->fl_type = F_UNLCK; + } + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + /* Compat code, old clients had no idea about owner field and + * relied solely on pid for ownership. Introduced in LU-104, 2.1, + * April 2011 */ + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid; +} + + +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
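+ *
+ * The hash is keyed by flock owner (a __u64).  The get/put hooks pin both
+ * the lock and its blocking export, so that the deadlock detector can
+ * safely follow the blocking_owner/blocking_export chain while walking
+ * the hash.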
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + flock->blocking_refs++; +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_RELEASE(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (--flock->blocking_refs == 0) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } +} + +static cfs_hash_ops_t ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if (strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + return 0; + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(ldlm_init_flock_export); + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } +} +EXPORT_SYMBOL(ldlm_destroy_flock_export); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 000000000..40d333850 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of IBITS lock type + * + * IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "ldlm_internal.h" + + +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; +} + +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h new file mode 100644 index 000000000..70b909f55 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,316 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define MAX_STRING_SIZE 128 + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; + +static inline int ldlm_namespace_nr_read(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(ldlm_side_t client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(ldlm_side_t client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline struct list_head *ldlm_namespace_inactive_list(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, ldlm_side_t); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + ldlm_side_t); +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t); + +/* ldlm_request.c */ +/* Cancel lru flag, it indicates we cancel aged locks. */ +enum { + LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */ + LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */ + LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */ + LDLM_CANCEL_LRUR = 1 << 3, /* Cancel locks from lru resize. 
*/ + LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither + * sending nor waiting for any rpcs) */ +}; + +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t sync, int flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + ldlm_cancel_flags_t cancel_flags, int flags); +extern int ldlm_enqueue_min; +int ldlm_get_enq_timeout(struct ldlm_lock *lock); + +/* ldlm_resource.c */ +int ldlm_resource_putref_locked(struct ldlm_resource *res); +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +/* ldlm_lock.c */ + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ +}; + +enum ldlm_desc_ast_t { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +}; + +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + ldlm_type_t type, ldlm_mode_t, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, + void *cookie, __u64 *flags); +void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + enum ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock); +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +void ldlm_cancel_locks_for_export(struct obd_export *export); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +extern struct kmem_cache *ldlm_resource_slab; + +/* ldlm_lockd.c & ldlm_lock.c */ +extern struct kmem_cache *ldlm_lock_slab; + +/* ldlm_extent.c */ +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void 
ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern struct proc_dir_entry *ldlm_svc_proc_dir; +extern struct proc_dir_entry *ldlm_type_proc_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ptlrpc_connection *ldlm_server_conn; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); +void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +typedef enum ldlm_policy_res ldlm_policy_res_t; + +#define LDLM_POOL_PROC_READER_SEQ_SHOW(var, type) \ + static int lprocfs_##var##_seq_show(struct seq_file *m, void *v) \ + { \ + struct ldlm_pool *pl = m->private; \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return lprocfs_rd_uint(m, &tmp); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_PROC_WRITER(var, type) \ + static int lprocfs_wr_##var(struct file *file, \ + const char __user *buffer, \ + unsigned long count, void *data) \ + { \ + struct ldlm_pool *pl = data; \ + type tmp; \ + int rc; \ + \ + rc = lprocfs_wr_uint(file, buffer, count, &tmp); \ + if (rc < 0) { \ + CERROR("Can't parse user input, rc = %d\n", rc); \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + pl->pl_##var = tmp; \ + spin_unlock(&pl->pl_lock); \ + \ + return rc; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + if (((lock->l_req_mode == lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_CP_REQD)) || + (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL))) + ret = 1; + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *, + ldlm_policy_data_t *); + +typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *, + ldlm_wire_policy_data_t *); + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + 
ldlm_wire_policy_data_t *wpolicy); +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c new file mode 100644 index 000000000..c5c86e73c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,870 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" +#include "ldlm_internal.h" + +/* @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + int rc = 0; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + return -EINVAL; + } + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + return -ENOENT; + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) { + rc = -ENOMEM; + goto out_put; + } + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_del(&item->oic_item); + list_add(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
", moved to head" : "")); + spin_unlock(&imp->imp_lock); + rc = 0; + goto out_free; + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? "head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + rc = -ENOENT; + goto out_free; + } + + spin_unlock(&imp->imp_lock); + return 0; +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + return rc; +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + goto out; + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + rc = -EBUSY; + goto out; + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + return rc; +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. + */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + return rc; +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. 
*/ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* Configure an RPC client OBD device. + * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obddev->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + char *name = obddev->obd_type->typ_name; + ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN; + int rc; + + /* In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_CLI; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + return -EINVAL; + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max. 
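+ * For now each client OBD gets OSC_MAX_DIRTY_DEFAULT MB of dirty cache,
+ * further capped at 1/8 of total RAM below.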
*/ + cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024; + if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > totalram_pages / 8) + cli->cl_dirty_max = totalram_pages << (PAGE_CACHE_SHIFT - 3); + INIT_LIST_HEAD(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + client_obd_list_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. */ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_set(&cli->cl_lru_busy, 0); + atomic_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + client_obd_list_lock_init(&cli->cl_lru_list_lock); + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* This value may be reduced at connect time in + * ptlrpc_connect_interpret() . We initialize it to only + * 1MB until we know what the performance looks like. + * In the future this should likely be increased. 
LU-1431 */ + cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES, + LNET_MTU >> PAGE_CACHE_SHIFT); + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obddev->obd_name)) + cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT; + else + cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + } + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + goto err; + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obddev->obd_ldlm_client); + + imp = class_new_import(obddev); + if (imp == NULL) { + rc = -ENOENT; + goto err_ldlm; + } + imp->imp_client = &obddev->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + goto err_import; + } + + cli->cl_import = imp; + /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obddev->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (obddev->obd_namespace == NULL) { + CERROR("Unable to create client namespace - %s\n", + obddev->obd_name); + rc = -ENOMEM; + goto err_import; + } + + cli->cl_qchk_stat = CL_NOT_QUOTACHECKED; + + return rc; + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + return rc; + +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obddev) +{ + ldlm_namespace_free_post(obddev->obd_namespace); + obddev->obd_namespace = NULL; + + LASSERT(obddev->u.cli.cl_import == NULL); + + ldlm_put_ref(); + return 0; +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) { + rc = -EALREADY; + goto out_sem; + } + + rc = class_connect(&conn, obd, cluuid); + if (rc) + goto out_sem; + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + goto out_ldlm; + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); 
+ goto out_ldlm; + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + } + + ptlrpc_pinger_add_import(imp); + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + return -EINVAL; + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name, + cli->cl_conn_count); + + if (!cli->cl_conn_count) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + rc = -EINVAL; + goto out_disconnect; + } + + cli->cl_conn_count--; + if (cli->cl_conn_count) { + rc = 0; + goto out_disconnect; + } + + /* Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, + obd->obd_force); + } + + /* There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. */ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + +out_disconnect: + /* Use server style - class_disconnect should be always called for + * o_disconnect. */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_disconnect_export); + + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + + /* Check that we still have all structures alive as this may + * be some late RPC at shutdown time. */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + return 0; + } + + /* OBD is alive here as export is alive, which we checked above. 
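+ * Copy the current SLV and limit into the reply under obd_pool_lock so
+ * that the client sees a consistent pair of values.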
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + return 0; +} +EXPORT_SYMBOL(target_pack_pool_reply); + +int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + } + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + + if (req->rq_no_reply) + return; + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg(req, rc, fail_id); + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ + rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); +} +EXPORT_SYMBOL(target_send_reply); + +ldlm_mode_t lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(ldlm_error_t error) +{ + int result; + + switch (error) { + case ELDLM_OK: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) /* cast to signed type */ + result = error; /* as ldlm_error_t can be unsigned */ + else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t. + */ +ldlm_error_t ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} +EXPORT_SYMBOL(ldlm_errno2error); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p,ignore if the unmount doesn't hang\n", + exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c new file mode 100644 index 000000000..84b111eb4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2322 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_intent.h" +#include "../include/obd_class.h" +#include "ldlm_internal.h" + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS", +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; +EXPORT_SYMBOL(ldlm_typename); + +static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire18_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire21_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + int new_client; + + /** some badness for 2.0.0 clients, but 2.0.0 isn't supported */ + new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0; + if (new_client) + 
convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE]; + else + convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +char *ldlm_it2str(int it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_UNLINK: + return "unlink"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent %d\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + atomic_inc(&lock->l_refc); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + LASSERT(lock->l_resource != LP_POISON); + LASSERT(atomic_read(&lock->l_refc) > 0); + if (atomic_dec_and_test(&lock->l_refc)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(lock->l_flags & LDLM_FL_DESTROYED); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + ldlm_resource_putref(res); + lock->l_resource = NULL; + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE(lock->l_lvb_data, lock->l_lvb_len); + + ldlm_interval_free(ldlm_interval_detach(lock)); + lu_ref_fini(&lock->l_reference); + OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); + } +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + */ +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (lock->l_flags & LDLM_FL_NS_SRV) { + LASSERT(list_empty(&lock->l_lru)); + return 0; + } + + spin_lock(&ns->ns_lock); + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + return rc; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. 
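+ *
+ * [For illustration only, not part of the Lustre sources: callers either
+ * take ns->ns_lock themselves and use this _nolock variant, or call the
+ * locking wrapper defined just below, whose body is simply
+ *
+ *	spin_lock(&ns->ns_lock);
+ *	ldlm_lock_add_to_lru_nolock(lock);
+ *	spin_unlock(&ns->ns_lock);
+ * ]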
+ */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = cfs_time_current(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + if (lock->l_flags & LDLM_FL_SKIPPED) + lock->l_flags &= ~LDLM_FL_SKIPPED; + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (lock->l_flags & LDLM_FL_NS_SRV) { + LASSERT(list_empty(&lock->l_lru)); + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (lock->l_flags & LDLM_FL_DESTROYED) { + LASSERT(list_empty(&lock->l_lru)); + return 0; + } + lock->l_flags |= LDLM_FL_DESTROYED; + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + +#if 0 + /* Wake anyone waiting for this lock */ + /* FIXME: I should probably add yet another flag, instead of using + * l_export to only call this on clients */ + if (lock->l_export) + class_export_put(lock->l_export); + lock->l_export = NULL; + if (lock->l_export && lock->l_completion_ast) + lock->l_completion_ast(lock, 0); +#endif + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. 
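+ *
+ * [For illustration only, not part of the Lustre sources: this wrapper
+ * behaves roughly like taking the resource lock around the _nolock
+ * variant defined below,
+ *
+ *	lock_res_and_lock(lock);
+ *	ldlm_lock_destroy_nolock(lock);
+ *	unlock_res_and_lock(lock);
+ *
+ * the main difference being that here the hash-table reference is
+ * released only after the resource lock has been dropped.]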
+ */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } +} + +/* this is called by portals_handle2object with the handle lock taken */ +static void lock_handle_addref(void *lock) +{ + LDLM_LOCK_GET((struct ldlm_lock *)lock); +} + +static void lock_handle_free(void *lock, int size) +{ + LASSERT(size == sizeof(struct ldlm_lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, size); +} + +struct portals_handle_ops lock_handle_ops = { + .hop_addref = lock_handle_addref, + .hop_free = lock_handle_free, +}; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + return NULL; + + spin_lock_init(&lock->l_lock); + lock->l_resource = resource; + lu_ref_add(&resource->lr_reference, "lock", lock); + + atomic_set(&lock->l_refc, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_LIST_HEAD(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, &lock_handle_ops); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + return lock; +} + +/** + * Moves LDLM lock \a lock to another resource. 
+ * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres = lock->l_resource; + struct ldlm_resource *newres; + int type; + + LASSERT(ns_is_client(ns)); + + lock_res_and_lock(lock); + if (memcmp(new_resid, &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (newres == NULL) + return -ENOMEM; + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, lock, oldres and + * newres have to be locked. Resource spin-locks are nested within + * lock->l_lock, and are taken in the memory address order to avoid + * dead-locks. + */ + spin_lock(&lock->l_lock); + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof(oldres->lr_name)) != 0); + lock->l_resource = newres; + unlock_res(oldres); + unlock_res_and_lock(lock); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + return 0; +} +EXPORT_SYMBOL(ldlm_lock_change_resource); + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + + LASSERT(handle); + + lock = class_handle2object(handle->cookie); + if (lock == NULL) + return NULL; + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if (flags == 0 && ((lock->l_flags & LDLM_FL_DESTROYED) == 0)) { + lu_ref_add(&lock->l_reference, "handle", current); + return lock; + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", current); + if (unlikely(lock->l_flags & LDLM_FL_DESTROYED)) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + return NULL; + } + + if (flags && (lock->l_flags & flags)) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + return NULL; + } + + if (flags) + lock->l_flags |= flags; + + unlock_res_and_lock(lock); + return lock; +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. 
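+ *
+ * [For illustration only, not part of the Lustre sources: a typical use
+ * is describing a conflicting lock when a blocking AST is sent, roughly
+ * as ldlm_work_bl_ast_lock() later in this file does:
+ *
+ *	struct ldlm_lock_desc d;
+ *
+ *	ldlm_lock2desc(lock->l_blocking_lock, &d);
+ *	rc = lock->l_blocking_ast(lock, &d, arg, LDLM_CB_BLOCKING);
+ * ]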
+ */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} +EXPORT_SYMBOL(ldlm_lock2desc); + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + lock->l_flags |= LDLM_FL_AST_SENT; + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (new->l_flags & LDLM_FL_AST_DISCARD_DATA) + lock->l_flags |= LDLM_FL_DISCARD_DATA; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) { + lock->l_flags |= LDLM_FL_CP_REQD; + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. + */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. 
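+ *
+ * [For illustration only, not part of the Lustre sources: a hypothetical
+ * caller that only wants the reference while the lock is still usable
+ * would pair it with ldlm_lock_decref(), e.g.
+ *
+ *	if (ldlm_lock_addref_try(lockh, LCK_PR) == 0) {
+ *		... use the protected data ...
+ *		ldlm_lock_decref(lockh, LCK_PR);
+ *	}
+ * ]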
+ */ +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !(lock->l_flags & LDLM_FL_CBPENDING)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accommodate flock locks + * that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) +{ + struct ldlm_namespace *ns; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if (lock->l_flags & LDLM_FL_LOCAL && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. */ + CDEBUG(D_INFO, "forcing cancel of local lock\n"); + lock->l_flags |= LDLM_FL_CBPENDING; + } + + if (!lock->l_readers && !lock->l_writers && + (lock->l_flags & LDLM_FL_CBPENDING)) { + /* If we received a blocked AST and this was the last reference, + * run the callback. 
*/ + if ((lock->l_flags & LDLM_FL_NS_SRV) && lock->l_export) + CERROR("FL_CBPENDING set on non-local lock--just a warning\n"); + + LDLM_DEBUG(lock, "final decref done on cbpending lock"); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if ((lock->l_flags & LDLM_FL_ATOMIC_CB) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !(lock->l_flags & LDLM_FL_NO_LRU) && + !(lock->l_flags & LDLM_FL_BL_AST)) { + + LDLM_DEBUG(lock, "add lock into lru list"); + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE + * are not supported by the server, otherwise, it is done on + * enqueue. */ + if (!exp_connect_cancelset(lock->l_conn_export) && + !ns_connect_lru_resize(ns)) + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + * Typical usage is for GROUP locks which we cannot allow to be cached. + */ +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. 
+ * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + return; + } else { + LDLM_ERROR(lock, + "is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. 
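+ *
+ * [For illustration only, not part of the Lustre sources: this path is
+ * taken only for PLAIN and IBITS locks; ldlm_grant_lock() below
+ * dispatches on the resource type as follows:
+ *
+ *	if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
+ *		ldlm_grant_lock_with_skiplist(lock);
+ *	else if (res->lr_type == LDLM_EXTENT)
+ *		ldlm_extent_add_lock(res, lock);
+ *	else
+ *		ldlm_resource_add_lock(res, &res->lr_granted, lock);
+ * ]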
+ */ +static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + + LASSERT(lock->l_req_mode == lock->l_granted_mode); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * - ldlm_lock_convert + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (lock->l_granted_mode < res->lr_most_restr) + res->lr_most_restr = lock->l_granted_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); +} + +/** + * Search for a lock with given properties in a queue. + * + * \retval a referenced lock or NULL. See the flag descriptions below, in the + * comment above ldlm_lock_match + */ +static struct ldlm_lock *search_queue(struct list_head *queue, + ldlm_mode_t *mode, + ldlm_policy_data_t *policy, + struct ldlm_lock *old_lock, + __u64 flags, int unref) +{ + struct ldlm_lock *lock; + struct list_head *tmp; + + list_for_each(tmp, queue) { + ldlm_mode_t match; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lock == old_lock) + break; + + /* Check if this lock can be matched. + * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + continue; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (lock->l_flags & LDLM_FL_CBPENDING && + !(flags & LDLM_FL_CBPENDING)) + continue; + if (!unref && lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + continue; + + if (!(lock->l_req_mode & *mode)) + continue; + match = lock->l_req_mode; + + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_policy_data.l_extent.start > + policy->l_extent.start || + lock->l_policy_data.l_extent.end < policy->l_extent.end)) + continue; + + if (unlikely(match == LCK_GROUP) && + lock->l_resource->lr_type == LDLM_EXTENT && + lock->l_policy_data.l_extent.gid != policy->l_extent.gid) + continue; + + /* We match if we have existing lock with same or wider set + of bits. 
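+ *
+ * [For illustration only, not part of the Lustre sources: the existing
+ * inodebits must be a superset of the requested ones, e.g.
+ *
+ *	existing 0x3, wanted 0x1:  (0x3 & 0x1) == 0x1  -> match
+ *	existing 0x1, wanted 0x3:  (0x1 & 0x3) != 0x3  -> no match
+ * ]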
*/ + if (lock->l_resource->lr_type == LDLM_IBITS && + ((lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) != + policy->l_inodebits.bits)) + continue; + + if (!unref && (lock->l_flags & LDLM_FL_GONE_MASK)) + continue; + + if ((flags & LDLM_FL_LOCAL_ONLY) && + !(lock->l_flags & LDLM_FL_LOCAL)) + continue; + + if (flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + *mode = match; + return lock; + } + + return NULL; +} + +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) +{ + if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) { + lock->l_flags |= LDLM_FL_FAIL_NOTIFIED; + wake_up_all(&lock->l_waitq); + } +} +EXPORT_SYMBOL(ldlm_lock_fail_match_locked); + +void ldlm_lock_fail_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_fail_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_fail_match); + +/** + * Mark lock as "matchable" by OST. + * + * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB + * is not yet valid. + * Assumes LDLM lock is already locked. + */ +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) +{ + lock->l_flags |= LDLM_FL_LVB_READY; + wake_up_all(&lock->l_waitq); +} +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); + +/** + * Mark lock as "matchable" by OST. + * Locks the lock and then \see ldlm_lock_allow_match_locked + */ +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_allow_match); + +/** + * Attempt to find a lock with specified properties. + * + * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is + * set in \a flags + * + * Can be called in two ways: + * + * If 'ns' is NULL, then lockh describes an existing lock that we want to look + * for a duplicate of. + * + * Otherwise, all of the fields must be filled in, to match against. + * + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer referneces + * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, + * just tell us if we would have matched. + * + * \retval 1 if it finds an already-existing lock that is compatible; in this + * case, lockh is filled in with a addref()ed lock + * + * We also check security context, and if that fails we simply return 0 (to + * keep caller code unchanged), the context failure will be discovered by + * caller sometime later. 
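+ *
+ * [For illustration only, not part of the Lustre sources: a hypothetical
+ * client-side caller looking for a cached extent lock might use it like
+ * this (ns, res_id and policy are assumed to be set up by the caller):
+ *
+ *	struct lustre_handle lockh;
+ *	ldlm_mode_t mode;
+ *
+ *	mode = ldlm_lock_match(ns, LDLM_FL_LVB_READY, &res_id, LDLM_EXTENT,
+ *			       &policy, LCK_PR | LCK_PW, &lockh, 0);
+ *	if (mode != 0) {
+ *		... lockh now holds an addref-ed lock ...
+ *		ldlm_lock_decref(&lockh, mode);
+ *	}
+ * ]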
+ */ +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *res_id, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_resource *res; + struct ldlm_lock *lock, *old_lock = NULL; + int rc = 0; + + if (ns == NULL) { + old_lock = ldlm_handle2lock(lockh); + LASSERT(old_lock); + + ns = ldlm_lock_to_ns(old_lock); + res_id = &old_lock->l_resource->lr_name; + type = old_lock->l_resource->lr_type; + mode = old_lock->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (res == NULL) { + LASSERT(old_lock == NULL); + return 0; + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + + lock = search_queue(&res->lr_granted, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + if (flags & LDLM_FL_BLOCK_GRANTED) { + rc = 0; + goto out; + } + lock = search_queue(&res->lr_converting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + + out: + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!(lock->l_flags & LDLM_FL_LVB_READY))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + struct l_wait_info lwi; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, + mode); + rc = 0; + goto out2; + } + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ + l_wait_event(lock->l_waitq, + lock->l_flags & wait_flags, + &lwi); + if (!(lock->l_flags & LDLM_FL_LVB_READY)) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + } + } + out2: + if (rc) { + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) { + if (!(flags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + + } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res %llu/%llu (%llu %llu)", + ns, type, mode, res_id->name[0], + res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } + if (old_lock) + LDLM_LOCK_PUT(old_lock); + + return rc ? 
mode : 0; +} +EXPORT_SYMBOL(ldlm_lock_match); + +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + ldlm_mode_t mode = 0; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_flags & LDLM_FL_GONE_MASK) + goto out; + + if (lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + goto out; + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + return -EINVAL; + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, + "Replied unexpected lquota LVB size %d", + size); + return -EINVAL; + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * Create and fill in new LDLM lock with specified properties. 
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, + ldlm_mode_t mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (res == NULL) + return NULL; + + lock = ldlm_lock_new(res); + + if (lock == NULL) + return NULL; + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current_pid(); + if (ns_is_server(ns)) + lock->l_flags |= LDLM_FL_NS_SRV; + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) { + if (ldlm_interval_alloc(lock) == NULL) + goto out; + } + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + goto out; + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + goto out; + + return lock; + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + return NULL; +} + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res = lock->l_resource; + int local = ns_is_client(ldlm_res_to_ns(res)); + ldlm_error_t rc = ELDLM_OK; + struct ldlm_interval *node = NULL; + + lock->l_last_activity = get_seconds(); + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, + NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. */ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + return 0; + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + return rc; + } + } + + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + lock_res_and_lock(lock); + if (local && lock->l_req_mode == lock->l_granted_mode) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. 
*/ + *flags &= ~(LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); + goto out; + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + rc = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + lock->l_flags |= *flags & LDLM_FL_AST_DISCARD_DATA; + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted/converting queues. */ + if (local) { + if (*flags & LDLM_FL_BLOCK_CONV) + ldlm_resource_add_lock(res, &res->lr_converting, lock); + else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + goto out; + } else { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + +out: + unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc d; + int rc; + struct ldlm_lock *lock; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + LASSERT(lock->l_flags & LDLM_FL_AST_SENT); + LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); + lock->l_bl_ast_run++; + unlock_res_and_lock(lock); + + ldlm_lock2desc(lock->l_blocking_lock, &d); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + int rc = 0; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. 
*/ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(lock->l_flags & LDLM_FL_CP_REQD); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + lock->l_flags &= ~LDLM_FL_CP_REQD; + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + + if (list_empty(arg->list)) + return -ENOENT; + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + + /* invoke the actual glimpse callback */ + if (lock->l_glimpse_ast(lock, (void *)arg) == 0) + rc = 1; + + LDLM_LOCK_RELEASE(lock); + + if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + OBD_FREE_PTR(gl_work); + + return rc; +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + enum ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + return 0; + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + return -ENOMEM; + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) { + rc = -ENOMEM; + goto out; + } + + ptlrpc_set_wait(arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? 
-ERESTART : 0; + goto out; +out: + OBD_FREE_PTR(arg); + return rc; +} + +static int reprocess_one_queue(struct ldlm_resource *res, void *closure) +{ + ldlm_reprocess_all(res); + return LDLM_ITER_CONTINUE; +} + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int rc; + + rc = reprocess_one_queue(res, arg); + + return rc == LDLM_ITER_STOP; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) +{ + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL); + } +} +EXPORT_SYMBOL(ldlm_reprocess_all_ns); + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on converting and waiting queues. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +void ldlm_reprocess_all(struct ldlm_resource *res) +{ + LIST_HEAD(rpc_list); + + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!(lock->l_flags & LDLM_FL_CANCEL)) { + lock->l_flags |= LDLM_FL_CANCEL; + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + } + lock->l_flags |= LDLM_FL_BL_DONE; +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (lock->l_flags & LDLM_FL_WAITED) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. */ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (lock->l_flags & LDLM_FL_WAITED) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (lock->l_granted_mode == lock->l_req_mode) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. 
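+ *
+ * [For illustration only, not part of the Lustre sources: the call only
+ * succeeds if no different data has been attached yet, so a hypothetical
+ * caller would check the return value, e.g.
+ *
+ *	if (ldlm_lock_set_data(lockh, inode) != 0)
+ *		... some other user already attached different ast data ...
+ * ]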
+ */ +int ldlm_lock_set_data(struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + return rc; +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + struct obd_export *ecl_exp; + int ecl_loop; +}; + +/** + * Iterator function for ldlm_cancel_locks_for_export. + * Cancels passed locks. + */ +int ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + LDLM_LOCK_GET(lock); + + LDLM_DEBUG(lock, "export %p", exp); + ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(res); + ldlm_resource_putref(res); + LDLM_LOCK_RELEASE(lock); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, + "Cancel lock %p for export %p (loop %d), still have %d locks left on hash table.\n", + lock, exp, ecl->ecl_loop, + atomic_read(&hs->hs_count)); + } + + return 0; +} + +/** + * Cancel all locks for given export. + * + * Typically called on client disconnection/eviction + */ +void ldlm_cancel_locks_for_export(struct obd_export *exp) +{ + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); +} + +/** + * Downgrade an exclusive lock. + * + * A fast variant of ldlm_lock_convert for conversion of exclusive + * locks. The conversion is always successful. + * Used by Commit on Sharing (COS) code. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode) +{ + LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX)); + LASSERT(new_mode == LCK_COS); + + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + ldlm_reprocess_all(lock->l_resource); +} +EXPORT_SYMBOL(ldlm_lock_downgrade); + +/** + * Attempt to convert already granted lock to a different mode. + * + * While lock conversion is not currently used, future client-side + * optimizations could take advantage of it to avoid discarding cached + * pages on a file. + */ +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + LIST_HEAD(rpc_list); + struct ldlm_resource *res; + struct ldlm_namespace *ns; + int granted = 0; + struct ldlm_interval *node; + + /* Just return if mode is unchanged. */ + if (new_mode == lock->l_granted_mode) { + *flags |= LDLM_FL_BLOCK_GRANTED; + return lock->l_resource; + } + + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. 
-jay */ + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + /* Actually, this causes EDEADLOCK to be returned */ + return NULL; + + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), + "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + lock->l_req_mode = new_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { + ldlm_resource_unlink_lock(lock); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + } + + /* + * Remove old lock from the pool before adding the lock with new + * mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); + + /* If this is a local resource, put it on the appropriate list. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { + /* This should never happen, because of the way the + * server handles conversions. */ + LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", + *flags); + LBUG(); + + ldlm_grant_lock(lock, &rpc_list); + granted = 1; + /* FIXME: completion handling not with lr_lock held ! */ + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + } + } else { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + unlock_res_and_lock(lock); + + if (granted) + ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return res; +} +EXPORT_SYMBOL(ldlm_lock_convert); + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. + */ +void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = lock->l_resource; + char *nid = "local"; + + va_start(args, fmt); + + if (exp && exp->exp_connection) { + nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + + nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + } + + if (resource == NULL) { + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout); + break; + + case LDLM_IBITS: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + default: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + } + va_end(args); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 000000000..08a91f5d9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,1191 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static struct mutex ldlm_ref_mutex; +static int ldlm_refcount; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +inline unsigned long round_timeout(unsigned long timeout) +{ + return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +#define ELT_STOPPED 0 +#define ELT_READY 1 +#define ELT_TERMINATE 2 + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see bug 13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. 
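+ *
+ * ldlm_bl_get_work() below drains blp_prio_list first but still takes
+ * an entry from blp_list at least once every blp_num_threads dequeues,
+ * so ordinary callbacks are not starved by a steady stream of
+ * LDLM_FL_DISCARD_DATA work.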
+ */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + ldlm_cancel_flags_t blwi_flags; + int blwi_mem_pressure; +}; + + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + return 0; +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +{ + return 0; +} + + + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. + */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) + lock->l_flags |= LDLM_FL_CANCEL; + + do_ast = !lock->l_readers && !lock->l_writers; + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, + "Lock %p already unused, calling callback (%p)\n", lock, + lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, + "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. + */ +static void ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int lvb_len; + LIST_HEAD(ast_list); + int rc = 0; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + int to = cfs_time_seconds(1); + + while (to > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(to); + if (lock->l_granted_mode == lock->l_req_mode || + lock->l_flags & LDLM_FL_DESTROYED) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + rc = lvb_len; + goto out; + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. 
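+ * (ost_lvb carries the OST object's size/blocks/timestamps, so the
+ * client can refresh those attributes from the reply instead of
+ * issuing a separate glimpse)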
*/ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d", + lock->l_lvb_len, lvb_len); + rc = -EINVAL; + goto out; + } + } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has + * variable length */ + void *lvb_data; + + OBD_ALLOC(lvb_data, lvb_len); + if (lvb_data == NULL) { + LDLM_ERROR(lock, "No memory: %d.\n", lvb_len); + rc = -ENOMEM; + goto out; + } + + lock_res_and_lock(lock); + LASSERT(lock->l_lvb_data == NULL); + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvb_data; + lock->l_lvb_len = lvb_len; + unlock_res_and_lock(lock); + } + } + + lock_res_and_lock(lock); + if ((lock->l_flags & LDLM_FL_DESTROYED) || + lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + rc = 0; + goto out; + } + + /* If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + goto out; + } + LDLM_DEBUG(lock, "completion AST, new resource"); + CERROR("change resource!\n"); + lock_res_and_lock(lock); + } + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + goto out; + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + goto out; + +out: + if (rc < 0) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAILED; + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. 
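+ *
+ * "Unused for a long time" concretely means, per the body below: the
+ * lock is granted in PW mode, has no readers or writers left, and has
+ * not been used for roughly 10 seconds. Such a lock is handed to the
+ * blocking threads via ldlm_bl_to_thread_lock(), with a direct
+ * ldlm_handle_bl_callback() call as the fallback if queuing fails.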
+ */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int rc = -ENOSYS; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + cfs_time_after(cfs_time_current(), + cfs_time_add(lock->l_last_used, + cfs_time_seconds(10)))) { + unlock_res_and_lock(lock); + if (ldlm_bl_to_thread_lock(ns, NULL, lock)) + ldlm_handle_bl_callback(ns, NULL, lock); + + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + ldlm_cancel_flags_t cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* can not check blwi->blwi_flags as blwi could be already freed in + LCF_ASYNC mode */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + return 0; +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + ldlm_cancel_flags_t cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (memory_pressure_get()) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_add(&blwi->blwi_head, cancels); + list_del_init(cancels); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. 
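+ *
+ * Callers normally use the two thin wrappers below:
+ * ldlm_bl_to_thread_lock() queues a single lock asynchronously and
+ * ldlm_bl_to_thread_list() hands over a prepared cancel list, e.g.
+ * (illustrative call, "cancels"/"count" being whatever the caller
+ * collected):
+ *
+ *	rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, LCF_ASYNC);
+ *
+ * In the synchronous (no LCF_ASYNC) case the work item is kept on the
+ * caller's stack so that no allocation is needed when this runs from
+ * the memory shrinker path.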
+ */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + if (cancels && count == 0) + return 0; + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + return -ENOMEM; + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + return __ldlm_bl_to_thread(blwi, cancel_flags); + } else { + /* if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + return __ldlm_bl_to_thread(&blwi, cancel_flags); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + return -EFAULT; + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + return -EFAULT; + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s: [nid %s] [rc %d] [lock %#llx]", + msg, libcfs_id2str(req->rq_peer), rc, + handle ? 
handle->cookie : 0); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause bug 21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause bug 21636.\n"); +} + +static int ldlm_handle_qc_callback(struct ptlrpc_request *req) +{ + struct obd_quotactl *oqctl; + struct client_obd *cli = &req->rq_export->exp_obd->u.cli; + + oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqctl == NULL) { + CERROR("Can't unpack obd_quotactl\n"); + return -EPROTO; + } + + oqctl->qc_stat = ptlrpc_status_ntoh(oqctl->qc_stat); + + cli->cl_qchk_stat = oqctl->qc_stat; + return 0; +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + return 0; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + return 0; + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + return 0; + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + return 0; + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + return 0; + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + return 0; + case OBD_QC_CALLBACK: + req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET)) + return 0; + rc = ldlm_handle_qc_callback(req); + ldlm_callback_reply(req, rc); + return 0; + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + return 0; + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + return 0; + } + + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + return 0; + } + + if ((lock->l_flags & LDLM_FL_FAIL_LOC) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. 
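+ * Only the bits in LDLM_AST_FLAGS are taken from the wire request and
+ * translated by ldlm_flags_from_wire(); unrelated flag bits sent by
+ * the peer are not copied onto the lock.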
*/ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_AST_FLAGS); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. */ + if (((lock->l_flags & LDLM_FL_CANCELING) && + (lock->l_flags & LDLM_FL_BL_DONE)) || + (lock->l_flags & LDLM_FL_FAILED)) { + LDLM_DEBUG(lock, "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + return 0; + } + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_BL_AST; + } + unlock_res_and_lock(lock); + + /* We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + ldlm_callback_reply(req, 0); + ldlm_handle_cp_callback(req, ns, dlm_req, lock); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + return 0; +} + + +static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl; + + spin_lock(&blp->blp_lock); + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= atomic_read(&blp->blp_num_threads)) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + + return blwi; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + char bltd_name[CFS_CURPROC_COMM_MAX]; + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + 
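+ /*
+ * bltd lives on this function's stack: ldlm_bl_thread_main() must copy
+ * everything it needs and signal bltd_comp before we return, which is
+ * why we block in wait_for_completion() below.
+ */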
init_completion(&bltd.bltd_comp); + bltd.bltd_num = atomic_read(&blp->blp_num_threads); + snprintf(bltd.bltd_name, sizeof(bltd.bltd_name), + "ldlm_bl_%02d", bltd.bltd_num); + task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + atomic_read(&blp->blp_num_threads), PTR_ERR(task)); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/** + * Main blocking requests processing thread. + * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. + */ +static int ldlm_bl_thread_main(void *arg) +{ + struct ldlm_bl_pool *blp; + + { + struct ldlm_bl_thread_data *bltd = arg; + + blp = bltd->bltd_blp; + + atomic_inc(&blp->blp_num_threads); + atomic_inc(&blp->blp_busy_threads); + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + } + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ldlm_bl_work_item *blwi = NULL; + int busy; + + blwi = ldlm_bl_get_work(blp); + + if (blwi == NULL) { + atomic_dec(&blp->blp_busy_threads); + l_wait_event_exclusive(blp->blp_waitq, + (blwi = ldlm_bl_get_work(blp)) != NULL, + &lwi); + busy = atomic_inc_return(&blp->blp_busy_threads); + } else { + busy = atomic_read(&blp->blp_busy_threads); + } + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + break; + + /* Not fatal if racy and have a few too many threads */ + if (unlikely(busy < blp->blp_max_threads && + busy >= atomic_read(&blp->blp_num_threads) && + !blwi->blwi_mem_pressure)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp); + + if (blwi->blwi_mem_pressure) + memory_pressure_set(); + + if (blwi->blwi_count) { + int count; + /* The special case when we cancel locks in LRU + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } + if (blwi->blwi_mem_pressure) + memory_pressure_clr(); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + } + + atomic_dec(&blp->blp_busy_threads); + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + return 0; +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + return rc; +} +EXPORT_SYMBOL(ldlm_get_ref); + +void ldlm_put_ref(void) +{ + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); +} +EXPORT_SYMBOL(ldlm_put_ref); + +/* + * Export handle<->lock hash operations. 
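+ *
+ * Each obd_export carries an exp_lock_hash keyed by the peer's lock
+ * handle cookie (l_remote_handle). The ops below are plugged into
+ * cfs_hash_create() in ldlm_init_export(), and the same table is
+ * walked when an export is torn down and all of its locks must be
+ * cancelled. A lookup by remote handle is then simply
+ * cfs_hash_lookup(exp->exp_lock_hash, &handle) (the generic libcfs
+ * helper, not shown in this file).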
+ */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static cfs_hash_ops_t ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + return -ENOMEM; + + rc = ldlm_init_flock_export(exp); + if (rc) + goto err; + + return 0; +err: + ldlm_destroy_export(exp); + return rc; +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; + int rc = 0; + int i; + + if (ldlm_state != NULL) + return -EALREADY; + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + return -ENOMEM; + + rc = ldlm_proc_setup(); + if (rc != 0) + goto out; + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = + 
ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + goto out; + } + + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) { + rc = -ENOMEM; + goto out; + } + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp); + if (rc < 0) + goto out; + } + + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + goto out; + } + return 0; + + out: + ldlm_cleanup(); + return rc; +} + +static int ldlm_cleanup(void) +{ + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + return -EBUSY; + } + + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); + + ldlm_proc_cleanup(); + + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + return 0; +} + +int ldlm_init(void) +{ + mutex_init(&ldlm_ref_mutex); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + if (ldlm_lock_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + return -ENOMEM; + } + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + kmem_cache_destroy(ldlm_lock_slab); + return -ENOMEM; + } +#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + kmem_cache_destroy(ldlm_resource_slab); + /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * synchronize_rcu() to wait a grace period elapsed, so that + * ldlm_lock_free() get a chance to be called. 
*/ + synchronize_rcu(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c new file mode 100644 index 000000000..a1fe2c161 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" + +#include "ldlm_internal.h" + + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c new file mode 100644 index 000000000..a9f4833e0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1455 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much + * sensitive client should be about last SLV from server. The higher LVF is the + * more locks will be canceled on client. Default value for it is 1. Setting LVF + * to 2 means that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). + * Andreas Dilger (adilger@clusterfs.com) proposed few nice ideas like using + * LVF and many cleanups. Flow definition to allow more easy understanding of + * the logic belongs to Nikita Danilov (nikita@clusterfs.com) as well as many + * cleanups and fixes. And design and implementation are done by Yury Umanets + * (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. 
Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via proc. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via proc tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/cl_object.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "ldlm_internal.h" + + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. 
Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = grant_usage << LDLM_POOL_SLV_SHIFT; + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) + slv = ldlm_pool_slv_max(limit); + else if (slv < ldlm_pool_slv_min(limit)) + slv = ldlm_pool_slv_min(limit); + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = atomic_read(&pl->pl_granted); + int grant_rate = atomic_read(&pl->pl_grant_rate); + int cancel_rate = atomic_read(&pl->pl_cancel_rate); + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. 
We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + return 0; + + spin_lock(&pl->pl_lock); + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + return 0; + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + return 0; +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return atomic_read(&pl->pl_granted); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (atomic_read(&pl->pl_granted) == 0) + return 0; + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. 
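+ *
+ * The limit set here is what ldlm_pool_recalc_grant_plan() and
+ * ldlm_pool_recalc_slv() above read back via ldlm_pool_get_limit().
+ * A rough worked example with made-up numbers, limit = 40000 and
+ * granted = 30000 at the default 10s recalc period:
+ *
+ *	grant_step = ldlm_pool_t2gsp(10) = 23%
+ *	grant_plan = 30000 + (40000 - 30000) * 23 / 100 = 32300
+ *
+ * which stays well below the 5/4 * limit cap. If the previous period's
+ * plan was also 32300 while only 30000 locks are granted, then
+ *
+ *	grant_usage = 40000 - (30000 - 32300) = 42300
+ *	slv_factor  = (42300 << 10) / 40000 = 1082
+ *
+ * so SLV is multiplied by 1082/1024, i.e. grows by about 5.7% for this
+ * period, telling clients they may cache somewhat more locks.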
+ */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + int ret; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + return 0; + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + return 0; + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_unlock(&pl->pl_lock); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) { + ret = 0; + goto out; + } + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, LDLM_CANCEL_LRUR); + +out: + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + return ret; +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + return 0; + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_CANCEL_SHRINK); +} + +static const struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static const struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. 
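The client-side "count" answer above is simply the number of unused LRU locks scaled by the VM cache-pressure knob. A tiny sketch, assuming the usual sysctl_vfs_cache_pressure default of 100:

#include <stdio.h>

static int vfs_cache_pressure = 100;   /* assumed sysctl default */

static unsigned long cli_shrink_count(int nr_unused)
{
        return (nr_unused / 100) * vfs_cache_pressure;
}

int main(void)
{
        printf("%lu\n", cli_shrink_count(2500));  /* 2500 at pressure 100 */
        return 0;
}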
Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + int count; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec <= 0) + goto recalc; + + spin_lock(&pl->pl_lock); + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + + recalc: + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + recalc_interval_sec = pl->pl_recalc_time - get_seconds() + + pl->pl_recalc_period; + if (recalc_interval_sec <= 0) { + /* Prevent too frequent recalculation. */ + CDEBUG(D_DLMTRACE, "Negative interval(%ld), " + "too short period(%ld)", + recalc_interval_sec, + pl->pl_recalc_period); + recalc_interval_sec = 1; + } + + return recalc_interval_sec; +} + +/* + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool pl is used. When nr == 0, just return the number of + * freeable locks. Otherwise, return the number of canceled locks. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, shrunk %d\n", + pl->pl_name, nr, cancel); + } + } + return cancel; +} +EXPORT_SYMBOL(ldlm_pool_shrink); + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. 
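The value returned by the recalc wrapper above is just "seconds until the next recalc is due", clamped to at least one second so the pools thread is never asked to run again immediately. A standalone restatement:

#include <stdio.h>
#include <time.h>

static long next_recalc_in(time_t last_recalc, long period, time_t now)
{
        long left = (long)difftime(last_recalc, now) + period;

        return left > 0 ? left : 1;     /* never request an immediate re-run */
}

int main(void)
{
        time_t now = time(NULL);

        printf("%ld\n", next_recalc_in(now - 3, 10, now));   /* 7 */
        printf("%ld\n", next_recalc_in(now - 60, 10, now));  /* clamped to 1 */
        return 0;
}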
+ */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return pl->pl_ops->po_setup(pl, limit); + return 0; +} +EXPORT_SYMBOL(ldlm_pool_setup); + +#if defined(CONFIG_PROC_FS) +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = atomic_read(&pl->pl_granted); + grant_rate = atomic_read(&pl->pl_grant_rate); + cancel_rate = atomic_read(&pl->pl_cancel_rate); + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, lvf); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n" + " GP: %d\n", + grant_step, grant_plan); + } + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n" + " G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_pool_state); + +static int lprocfs_grant_speed_seq_show(struct seq_file *m, void *unused) +{ + struct ldlm_pool *pl = m->private; + int grant_speed; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + grant_speed = atomic_read(&pl->pl_grant_rate) - + atomic_read(&pl->pl_cancel_rate); + spin_unlock(&pl->pl_lock); + return lprocfs_rd_uint(m, &grant_speed); +} + +LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int); +LPROC_SEQ_FOPS_RO(lprocfs_grant_plan); + +LDLM_POOL_PROC_READER_SEQ_SHOW(recalc_period, int); +LDLM_POOL_PROC_WRITER(recalc_period, int); +static ssize_t lprocfs_recalc_period_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + return lprocfs_wr_recalc_period(file, buf, len, seq->private); +} +LPROC_SEQ_FOPS(lprocfs_recalc_period); + +LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, u64); +LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, atomic); +LPROC_SEQ_FOPS_RW_TYPE(ldlm_pool_rw, atomic); + +LPROC_SEQ_FOPS_RO(lprocfs_grant_speed); + +#define LDLM_POOL_ADD_VAR(name, var, ops) \ + do { \ + snprintf(var_name, MAX_STRING_SIZE, #name); \ + pool_vars[0].data = var; \ + pool_vars[0].fops = ops; \ + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, NULL);\ + } while (0) + +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct proc_dir_entry *parent_ns_proc; + struct lprocfs_vars pool_vars[2]; + char *var_name = NULL; + int rc = 0; + + OBD_ALLOC(var_name, MAX_STRING_SIZE + 1); + if (!var_name) + return -ENOMEM; + + parent_ns_proc = ns->ns_proc_dir_entry; + if (parent_ns_proc == NULL) { + CERROR("%s: proc entry is not initialized\n", + ldlm_ns_name(ns)); + rc = -EINVAL; + goto out_free_name; + } + pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, + NULL, NULL); + if (IS_ERR(pl->pl_proc_dir)) { + CERROR("LProcFS failed in ldlm-pool-init\n"); + rc = PTR_ERR(pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + goto out_free_name; + } + + var_name[MAX_STRING_SIZE] = '\0'; + memset(pool_vars, 0, sizeof(pool_vars)); + pool_vars[0].name = var_name; + + LDLM_POOL_ADD_VAR("server_lock_volume", &pl->pl_server_lock_volume, + &ldlm_pool_u64_fops); + 
LDLM_POOL_ADD_VAR("limit", &pl->pl_limit, &ldlm_pool_rw_atomic_fops); + LDLM_POOL_ADD_VAR("granted", &pl->pl_granted, &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_speed", pl, &lprocfs_grant_speed_fops); + LDLM_POOL_ADD_VAR("cancel_rate", &pl->pl_cancel_rate, + &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_rate", &pl->pl_grant_rate, + &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_plan", pl, &lprocfs_grant_plan_fops); + LDLM_POOL_ADD_VAR("recalc_period", pl, &lprocfs_recalc_period_fops); + LDLM_POOL_ADD_VAR("lock_volume_factor", &pl->pl_lock_volume_factor, + &ldlm_pool_rw_atomic_fops); + LDLM_POOL_ADD_VAR("state", pl, &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) { + rc = -ENOMEM; + goto out_free_name; + } + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_timing", "sec"); + rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + +out_free_name: + OBD_FREE(var_name, MAX_STRING_SIZE + 1); + return rc; +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + if (pl->pl_proc_dir != NULL) { + lprocfs_remove(&pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + } +} +#else /* !CONFIG_PROC_FS */ +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + return 0; +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) {} +#endif /* CONFIG_PROC_FS */ + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client) +{ + int rc; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = get_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = 
LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_proc_init(pl); + if (rc) + return rc; + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + return rc; +} +EXPORT_SYMBOL(ldlm_pool_init); + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ldlm_pool_proc_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); +} +EXPORT_SYMBOL(ldlm_pool_fini); + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_add); + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_del); + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_slv); + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_slv); + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_clv); + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_clv); + +/** + * Returns current \a pl limit. 
+ */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} +EXPORT_SYMBOL(ldlm_pool_get_limit); + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} +EXPORT_SYMBOL(ldlm_pool_set_limit); + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} +EXPORT_SYMBOL(ldlm_pool_get_lvf); + +static int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +static struct ptlrpc_thread *ldlm_pools_thread; +static struct completion ldlm_pools_comp; + +/* + * count locks from all namespaces (if possible). Returns number of + * cached locks. + */ +static unsigned long ldlm_pools_count(ldlm_side_t client, gfp_t gfp_mask) +{ + int total = 0, nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + void *cookie; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n", + client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + cookie = cl_env_reenter(); + + /* + * Find out how many resources we may release. + */ + for (nr_ns = ldlm_namespace_nr_read(client); + nr_ns > 0; nr_ns--) { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + cl_env_reexit(cookie); + return 0; + } + ns = ldlm_namespace_first_locked(client); + + if (ns == ns_old) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + cl_env_reexit(cookie); + return total; +} + +static unsigned long ldlm_pools_scan(ldlm_side_t client, int nr, gfp_t gfp_mask) +{ + unsigned long freed = 0; + int tmp, nr_ns; + struct ldlm_namespace *ns; + void *cookie; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return -1; + + cookie = cl_env_reenter(); + + /* + * Shrink at least ldlm_namespace_nr_read(client) namespaces. + */ + for (tmp = nr_ns = ldlm_namespace_nr_read(client); + tmp > 0; tmp--) { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + /* + * We use to shrink propotionally but with new shrinker API, + * we lost the total number of freeable locks. + */ + cancel = 1 + min_t(int, nr_locks, nr / nr_ns); + freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + ldlm_namespace_put(ns); + } + cl_env_reexit(cookie); + /* + * we only decrease the SLV in server pools shrinker, return + * SHRINK_STOP to kernel to avoid needless loop. LU-1128 + */ + return (client == LDLM_NAMESPACE_SERVER) ? 
SHRINK_STOP : freed; +} + +static unsigned long ldlm_pools_srv_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask); +} + +static unsigned long ldlm_pools_srv_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan, + sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan, + sc->gfp_mask); +} + +int ldlm_pools_recalc(ldlm_side_t client) +{ + __u32 nr_l = 0, nr_p = 0, l; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + int nr, equal = 0; + int time = 50; /* seconds of sleep if no active namespaces */ + + /* + * No need to setup pool limit for client pools. + */ + if (client == LDLM_NAMESPACE_SERVER) { + /* + * Check all modest namespaces first. + */ + mutex_lock(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks limit (%d of %lu). This means that you have too many clients for this amount of server RAM. Upgrade server!\n", + nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* + * The rest is given to greedy namespaces. + */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(client); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(client) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(client)); + } + + /* + * Recalc at least ldlm_namespace_nr_read(client) namespaces. + */ + for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock what is really good as we get + * rid of potential deadlock on client nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. 
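The server-side limit split above can be condensed to a few lines. LDLM_POOL_HOST_L and LDLM_POOLS_MODEST_MARGIN_SHIFT are defined elsewhere in the tree, so the host limit of 100000 and the margin shift of 4 (roughly +6%, matching the comment) used below are assumptions for illustration only.

#include <stdio.h>

#define HOST_L        100000   /* assumed stand-in for LDLM_POOL_HOST_L */
#define MARGIN_SHIFT  4        /* assumed: +1/16, i.e. ~6% */

int main(void)
{
        int modest_granted[] = { 2000, 3000 };
        int n_modest = 2, n_greedy = 3, n_total = n_modest + n_greedy;
        unsigned int nr_l = 0;
        int i;

        /* modest namespaces: average granted count plus ~6% */
        for (i = 0; i < n_modest; i++) {
                unsigned int l = modest_granted[i] ? modest_granted[i] : 1;

                l += l >> MARGIN_SHIFT;
                printf("modest[%d] limit = %u\n", i, l);
                nr_l += l;
        }

        /* if modest pools claimed 2/3 of the host limit, fall back to equal shares */
        if (nr_l >= 2 * (HOST_L / 3))
                printf("equal share for all: %d\n", HOST_L / n_total);
        else
                printf("each greedy pool gets %u\n", (HOST_L - nr_l) / n_greedy);
        return 0;
}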
+ * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). + */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + /* + * After setup is done - recalc the pool. + */ + if (!skip) { + int ttime = ldlm_pool_recalc(&ns->ns_pool); + + if (ttime < time) + time = ttime; + + ldlm_namespace_put(ns); + } + } + return time; +} +EXPORT_SYMBOL(ldlm_pools_recalc); + +static int ldlm_pools_thread_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + int s_time, c_time; + + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", + "ldlm_poold", current_pid()); + + while (1) { + struct l_wait_info lwi; + + /* + * Recal all pools on this tick. + */ + s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); + c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + + /* + * Wait until the next check time, or until we're + * stopped. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + else + thread_test_and_clear_flags(thread, SVC_EVENT); + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", + "ldlm_poold", current_pid()); + + complete_and_exit(&ldlm_pools_comp, 0); +} + +static int ldlm_pools_thread_start(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + + if (ldlm_pools_thread != NULL) + return -EALREADY; + + OBD_ALLOC_PTR(ldlm_pools_thread); + if (ldlm_pools_thread == NULL) + return -ENOMEM; + + init_completion(&ldlm_pools_comp); + init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + + task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, + "ldlm_poold"); + if (IS_ERR(task)) { + CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); + OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); + ldlm_pools_thread = NULL; + return PTR_ERR(task); + } + l_wait_event(ldlm_pools_thread->t_ctl_waitq, + thread_is_running(ldlm_pools_thread), &lwi); + return 0; +} + +static void ldlm_pools_thread_stop(void) +{ + if (ldlm_pools_thread == NULL) + return; + + thread_set_flags(ldlm_pools_thread, SVC_STOPPING); + wake_up(&ldlm_pools_thread->t_ctl_waitq); + + /* + * Make sure that pools thread is finished before freeing @thread. + * This fixes possible race and oops due to accessing freed memory + * in pools thread. 
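The lifetime of ldlm_poold described above reduces to "recalc both sides, then sleep until the earlier deadline or until stopped". A stripped-down sketch, with plain sleep() standing in for the interruptible kernel wait:

#include <stdio.h>
#include <unistd.h>

static int recalc_server(void) { return 10; }   /* seconds to next server recalc */
static int recalc_client(void) { return 3; }    /* seconds to next client recalc */

int main(void)
{
        int ticks = 3;                           /* demo: stop after three rounds */

        while (ticks--) {
                int s = recalc_server();
                int c = recalc_client();
                int next = s < c ? s : c;        /* wake for the earlier deadline */

                printf("sleeping %d s until next recalc\n", next);
                sleep(next);
        }
        return 0;
}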
+ */ + wait_for_completion(&ldlm_pools_comp); + OBD_FREE_PTR(ldlm_pools_thread); + ldlm_pools_thread = NULL; +} + +static struct shrinker ldlm_pools_srv_shrinker = { + .count_objects = ldlm_pools_srv_count, + .scan_objects = ldlm_pools_srv_scan, + .seeks = DEFAULT_SEEKS, +}; + +static struct shrinker ldlm_pools_cli_shrinker = { + .count_objects = ldlm_pools_cli_count, + .scan_objects = ldlm_pools_cli_scan, + .seeks = DEFAULT_SEEKS, +}; + +int ldlm_pools_init(void) +{ + int rc; + + rc = ldlm_pools_thread_start(); + if (rc == 0) { + register_shrinker(&ldlm_pools_srv_shrinker); + register_shrinker(&ldlm_pools_cli_shrinker); + } + return rc; +} +EXPORT_SYMBOL(ldlm_pools_init); + +void ldlm_pools_fini(void) +{ + unregister_shrinker(&ldlm_pools_srv_shrinker); + unregister_shrinker(&ldlm_pools_cli_shrinker); + ldlm_pools_thread_stop(); +} +EXPORT_SYMBOL(ldlm_pools_fini); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c new file mode 100644 index 000000000..4f7131831 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2294 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. 
If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/obd.h" + +#include "ldlm_internal.h" + +int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, int, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +static void interrupted_completion_wait(void *data) +{ +} + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +int ldlm_expired_completion_wait(void *data) +{ + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + if (lock->l_conn_export == NULL) { + static unsigned long next_dump, last_dump; + + LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago)\n", + lock->l_last_activity, + cfs_time_sub(get_seconds(), + lock->l_last_activity)); + LDLM_DEBUG(lock, "lock timed out (enqueued at " CFS_TIME_T ", " CFS_DURATION_T "s ago); not entering recovery in server code, just going back to sleep", + lock->l_last_activity, + cfs_time_sub(get_seconds(), + lock->l_last_activity)); + if (cfs_time_after(cfs_time_current(), next_dump)) { + last_dump = next_dump; + next_dump = cfs_time_shift(300); + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + return 0; + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago), entering recovery for %s@%s", + lock->l_last_activity, + cfs_time_sub(get_seconds(), lock->l_last_activity), + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + return 0; +} +EXPORT_SYMBOL(ldlm_expired_completion_wait); + +/* We use the same basis for both server side and client side functions + from a single node. */ +int ldlm_get_enq_timeout(struct ldlm_lock *lock) +{ + int timeout = at_get(ldlm_lock_to_ns_at(lock)); + + if (AT_OFF) + return obd_timeout / 2; + /* Since these are non-updating timeouts, we should be conservative. + It would be nice to have some kind of "early reply" mechanism for + lock callbacks too... */ + timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */ + return max(timeout, ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_get_enq_timeout); + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. 
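The enqueue timeout chosen above is "adaptive estimate plus 50%, capped at at_max, but never below the ldlm_enqueue_min module parameter". A standalone restatement; the obd_timeout and at_max values are picked purely for the example:

#include <stdio.h>

#define OBD_TIMEOUT_EX   100   /* assumed obd_timeout (seconds) */
#define AT_MAX_EX        600   /* assumed adaptive-timeout ceiling */

static int enq_timeout(int at_estimate, int at_off, int enqueue_min)
{
        int timeout;

        if (at_off)
                return OBD_TIMEOUT_EX / 2;            /* adaptive timeouts disabled */

        timeout = at_estimate + (at_estimate >> 1);   /* 150% of the estimate */
        if (timeout > AT_MAX_EX)
                timeout = AT_MAX_EX;
        return timeout > enqueue_min ? timeout : enqueue_min;
}

int main(void)
{
        printf("%d\n", enq_timeout(20, 0, 5));   /* 30 */
        printf("%d\n", enq_timeout(2, 0, 5));    /* clamped up to enqueue_min = 5 */
        printf("%d\n", enq_timeout(20, 1, 5));   /* 50: adaptive timeouts off */
        return 0;
}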
+ */ +static int ldlm_completion_tail(struct ldlm_lock *lock) +{ + long delay; + int result; + + if (lock->l_flags & (LDLM_FL_DESTROYED | LDLM_FL_FAILED)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else { + delay = cfs_time_sub(get_seconds(), + lock->l_last_activity); + LDLM_DEBUG(lock, "client-side enqueue: granted after " + CFS_DURATION_T"s", delay); + + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), + delay); + result = 0; + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. + */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + return 0; + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + return ldlm_completion_tail(lock); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource); + return 0; +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * - during lock conversion (not used currently). + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct l_wait_info lwi; + __u32 timeout; + int rc = 0; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + return 0; + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) + imp = obd->u.cli.cl_import; + + /* Wait a long time for enqueue - server may have to callback a + lock from another client. Server will evict the other client if it + doesn't respond reasonably, and then give us the lock. 
*/ + timeout = ldlm_get_enq_timeout(lock) * 2; + + lwd.lwd_lock = lock; + + if (lock->l_flags & LDLM_FL_NO_TIMEOUT) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + lwi = LWI_INTR(interrupted_completion_wait, &lwd); + } else { + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); + } + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + lock->l_flags |= LDLM_FL_FAIL_LOC; + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. */ + rc = l_wait_event(lock->l_waitq, + is_granted_or_cancelled(lock), &lwi); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + + return ldlm_completion_tail(lock); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * deferred lock cancellation. + * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + + lock->l_flags |= LDLM_FL_CBPENDING; + do_ast = !lock->l_readers && !lock->l_writers; + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be cancelled later"); + } + return 0; +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + return 0; + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + return 0; + } + return ldlm_blocking_ast_nocheck(lock); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See + * comment in filter_intent_policy() on why you may need this. + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + /* + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for + * that is rather subtle: with OST-side locking, it may so happen that + * _all_ extent locks are held by the OST. If client wants to obtain + * current file size it calls ll{,u}_glimpse_size(), and (as locks are + * on the server), dummy glimpse callback fires and does + * nothing. 
Client still receives correct file size due to the + * following fragment in filter_intent_policy(): + * + * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB + * if (rc != 0 && res->lr_namespace->ns_lvbo && + * res->lr_namespace->ns_lvbo->lvbo_update) { + * res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1); + * } + * + * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and + * returns correct file size to the client. + */ + return -ELDLM_NO_LOCK_DATA; +} +EXPORT_SYMBOL(ldlm_glimpse_ast); + +/** + * Enqueue a local lock (typically on a server). + */ +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (unlikely(!lock)) { + err = -ENOMEM; + goto out_nolock; + } + + ldlm_lock2handle(lock, lockh); + + /* NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock */ + ldlm_lock_addref_internal_nolock(lock, mode); + lock->l_flags |= LDLM_FL_LOCAL; + if (*flags & LDLM_FL_ATOMIC_CB) + lock->l_flags |= LDLM_FL_ATOMIC_CB; + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + + err = ldlm_lock_enqueue(ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + goto out; + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if ((lock->l_req_mode != lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. + * bug 17645 */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + ldlm_lock_decref_internal(lock, mode); + + /* XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). */ + /* This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. 
(interrupted and those that + * were waiting to be granted when server evicted us. */ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + } +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + int size = 0; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(type == LDLM_FLOCK); + return -ENOLCK; + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + goto cleanup; + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto cleanup; + } + + if (lvb_len != 0) { + LASSERT(lvb != NULL); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + rc = size; + goto cleanup; + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d", + lvb_len, size); + rc = -EINVAL; + goto cleanup; + } + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len != 0) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, size); + if (rc == 0) + rc = ELDLM_LOCK_ABORTED; + goto cleanup; + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_INHERIT_FLAGS); + /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match() + * to wait with no timeout as well */ + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_NO_TIMEOUT); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: 0x%llx\n", + lock, reply->lock_handle.cookie, *flags); + + /* If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. 
*/ + if ((*flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES + " instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) { + rc = -ENOMEM; + goto cleanup; + } + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + if (with_policy) + if (!(type == LDLM_IBITS && + !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock, + "client-side enqueue, new policy data"); + } + + if ((*flags) & LDLM_FL_AST_SENT || + /* Cancel extent locks as soon as possible on a liblustre client, + * because it cannot handle asynchronous ASTs robustly (see + * bug 7311). */ + (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ + if (lvb_len != 0) { + /* We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. + * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in */ + lock_res_and_lock(lock); + if (lock->l_req_mode != lock->l_granted_mode) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, size); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + goto cleanup; + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len && lvb != NULL) { + /* Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. 
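The handle-count estimate described above is straightforward arithmetic; the sketch below assumes a 4KB page, the 512-byte header allowance from the comment, and made-up sizes for the fixed request part and struct lustre_handle, since the real LDLM_MAXREQSIZE and LDLM_LOCKREQ_HANDLES values are defined elsewhere.

#include <stdio.h>

#define PAGE_SZ          4096
#define HDR_ALLOWANCE     512   /* room left for TCP/IP + LNET headers */
#define MAX_REQ_SZ    PAGE_SZ   /* stand-in for LDLM_MAXREQSIZE */
#define HANDLE_SZ           8   /* stand-in for sizeof(struct lustre_handle) */
#define LOCKREQ_HANDLES     2   /* stand-in for LDLM_LOCKREQ_HANDLES */

static int handles_avail(int req_size, int off)
{
        int budget = (MAX_REQ_SZ < PAGE_SZ - HDR_ALLOWANCE ?
                      MAX_REQ_SZ : PAGE_SZ - HDR_ALLOWANCE) - req_size;
        int avail = budget >= 0 ? budget / HANDLE_SZ : 0;

        return avail + LOCKREQ_HANDLES - off;   /* handles already in the request */
}

int main(void)
{
        printf("%d\n", handles_avail(1024, 1));  /* (3584-1024)/8 + 2 - 1 = 321 */
        return 0;
}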
+ */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + int size = req_capsule_msg_size(pill, loc); + + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + int flags, avail, to_free, pack = 0; + LIST_HEAD(head); + int rc; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + to_free = !ns_connect_lru_resize(ns) && + opc == LDLM_ENQUEUE ? 1 : 0; + + /* Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, flags); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + return rc; + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. 
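Deciding how many cancels to piggy-back on the enqueue above comes down to: top the caller's list up from the LRU while there is room, pack at most "avail" handles into the enqueue, and ship any overflow in a separate CANCEL RPC. A reduced model of that counting logic (lru_cancelable is a made-up stand-in for what ldlm_cancel_lru_local() would find):

#include <stdio.h>

static void elc_plan(int avail, int count, int lru_cancelable,
                     int *packed, int *separate_rpc)
{
        if (avail > count)                       /* room left: cancel extra LRU locks too */
                count += lru_cancelable < avail - count ?
                         lru_cancelable : avail - count;

        *packed = count < avail ? count : avail; /* rides inside the enqueue request */
        *separate_rpc = count - *packed;         /* needs its own CANCEL RPC */
}

int main(void)
{
        int packed, separate;

        elc_plan(/*avail*/ 50, /*count*/ 10, /*lru*/ 100, &packed, &separate);
        printf("packed=%d separate=%d\n", packed, separate);  /* 50 and 0 */

        elc_plan(50, 80, 100, &packed, &separate);
        printf("packed=%d separate=%d\n", packed, separate);  /* 50 and 30 */
        return 0;
}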
*/ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + return 0; +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + return req; +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + struct ptlrpc_request *req; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
*/ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (lock == NULL) + return -ENOMEM; + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); + + /* lock not sent to server yet */ + + if (reqp == NULL || *reqp == NULL) { + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + return -ENOMEM; + } + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* Continue as normal. */ + if (!req_passed_in) { + if (lvb_len > 0) + req_capsule_extend(&req->rq_pill, + &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lvb_len); + ptlrpc_request_set_replen(req); + } + + /* + * Liblustre client doesn't get extent locks, except for O_APPEND case + * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where + * [i_size, OBD_OBJECT_EOF] lock is taken. + */ + LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT || + policy->l_extent.end == OBD_OBJECT_EOF)); + + if (async) { + LASSERT(reqp != NULL); + return 0; + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 
1 : 0, + einfo->ei_mode, flags, lvb, lvb_len, + lockh, rc); + + /* If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + struct ldlm_resource *res; + int rc; + + if (ns_is_client(ldlm_lock_to_ns(lock))) { + CERROR("Trying to cancel local lock\n"); + LBUG(); + } + LDLM_DEBUG(lock, "client-side local convert"); + + res = ldlm_lock_convert(lock, new_mode, flags); + if (res) { + ldlm_reprocess_all(res); + rc = 0; + } else { + rc = LUSTRE_EDEADLK; + } + LDLM_DEBUG(lock, "client-side local convert handler END"); + LDLM_LOCK_PUT(lock); + return rc; +} + +/* FIXME: one of ldlm_cli_convert or the server side should reject attempted + * conversion of locks which are on the waiting or converting queue */ +/* Caller of this code is supposed to take care of lock readers/writers + accounting */ +int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags) +{ + struct ldlm_request *body; + struct ldlm_reply *reply; + struct ldlm_lock *lock; + struct ldlm_resource *res; + struct ptlrpc_request *req; + int rc; + + lock = ldlm_handle2lock(lockh); + if (!lock) { + LBUG(); + return -EINVAL; + } + *flags = 0; + + if (lock->l_conn_export == NULL) + return ldlm_cli_convert_local(lock, new_mode, flags); + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) { + LDLM_LOCK_PUT(lock); + return -ENOMEM; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = new_mode; + body->lock_flags = ldlm_flags_to_wire(*flags); + + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc != ELDLM_OK) + goto out; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + if (req->rq_status) { + rc = req->rq_status; + goto out; + } + + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); + if (res != NULL) { + ldlm_reprocess_all(res); + /* Go to sleep until the lock is granted. */ + /* FIXME: or cancelled. */ + if (lock->l_completion_ast) { + rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); + if (rc) + goto out; + } + } else { + rc = LUSTRE_EDEADLK; + } + out: + LDLM_LOCK_PUT(lock); + ptlrpc_req_finished(req); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (lock->l_flags & LDLM_FL_BL_AST) ? 
+ LDLM_FL_BL_AST : LDLM_FL_CANCELING; + unlock_res_and_lock(lock); + + if (local_only) { + CDEBUG(D_DLMTRACE, "not sending request (at caller's instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource); + } + + return rc; +} + +/** + * Pack \a count locks in \a head into ldlm_request buffer of request \a req. + */ +static void ldlm_cancel_pack(struct ptlrpc_request *req, + struct list_head *head, int count) +{ + struct ldlm_request *dlm; + struct ldlm_lock *lock; + int max, packed = 0; + + dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + LASSERT(dlm != NULL); + + /* Check the room in the request buffer. */ + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + sizeof(struct ldlm_request); + max /= sizeof(struct lustre_handle); + max += LDLM_LOCKREQ_HANDLES; + LASSERT(max >= dlm->lock_count + count); + + /* XXX: it would be better to pack lock handles grouped by resource. + * so that the server cancel would call filter_lvbo_update() less + * frequently. */ + list_for_each_entry(lock, head, l_bl_ast) { + if (!count--) + break; + LASSERT(lock->l_conn_export); + /* Pack the lock handle to the given request buffer. */ + LDLM_DEBUG(lock, "packing"); + dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; + packed++; + } + CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); +} + +/** + * Prepare and send a batched cancel RPC. It will include \a count lock + * handles of locks given in \a cancels list. */ +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, + int count, ldlm_cancel_flags_t flags) +{ + struct ptlrpc_request *req = NULL; + struct obd_import *imp; + int free, sent = 0; + int rc = 0; + + LASSERT(exp != NULL); + LASSERT(count > 0); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); + + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) + return count; + + free = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, RCL_CLIENT, 0); + if (count > free) + count = free; + + while (1) { + imp = class_exp2cliimp(exp); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_DLMTRACE, + "skipping cancel on invalid import %p\n", imp); + return count; + } + + req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(count, LDLM_CANCEL)); + + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); + if (rc) { + ptlrpc_request_free(req); + goto out; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + sent = count; + goto out; + } else { + rc = ptlrpc_queue_wait(req); + } + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, "client/server (nid %s) out of sync -- not fatal\n", + libcfs_nid2str(req->rq_import-> + imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + 
/* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: canceling anyway\n", + rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); +out: + return sent ? sent : rc; +} +EXPORT_SYMBOL(ldlm_cli_cancel_req); + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct obd_device *obd; + __u64 new_slv; + __u32 new_limit; + + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) { + /* + * Do nothing for corner cases. + */ + return 0; + } + + /* In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, + "Zero SLV or Limit found (SLV: %llu, Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + return 0; + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + /* Set new SLV and limit in OBD fields to make them accessible + * to the pool thread. We do not access obd_namespace and pool + * directly here as there is no reliable way to make sure that + * they are still alive at cleanup time. Evil races are possible + * which may cause Oops at that time. */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + return 0; +} +EXPORT_SYMBOL(ldlm_cli_update_pool); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags) +{ + struct obd_export *exp; + int avail, flags, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + LIST_HEAD(cancels); + + /* concurrent cancels on the same handle can happen */ + lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed\n"); + return 0; + } + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + return 0; + } + /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + flags = ns_connect_lru_resize(ns) ? 
+ LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, flags); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. + */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags) +{ + LIST_HEAD(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. */ + if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_del_init(&lock->l_bl_ast); + list_add(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + return count; +} +EXPORT_SYMBOL(ldlm_cli_cancel_list_local); + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) + */ +static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK; + ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery; + + lock_res_and_lock(lock); + + /* don't check added & count since we want to process all locks + * from unused list */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (cb && cb(lock)) + break; + default: + result = LDLM_POLICY_SKIP_LOCK; + lock->l_flags |= LDLM_FL_SKIPPED; + break; + } + + unlock_res_and_lock(lock); + return result; +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current + * scan \a added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + unsigned long cur = cfs_time_current(); + struct ldlm_pool *pl = &ns->ns_pool; + __u64 slv, lvf, lv; + unsigned long la; + + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + if (count && added >= count) + return LDLM_POLICY_KEEP_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = cfs_duration_sec(cfs_time_sub(cur, + lock->l_last_used)); + lv = lvf * la * unused; + + /* Inform pool about current CLV to see it via proc. */ + ldlm_pool_set_clv(pl, lv); + + /* Stop when SLV is not yet come from server or lv is smaller than + * it is. */ + return (slv == 0 || lv < slv) ? 
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for proc used policy. Makes decision whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current scan \a + * added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for aged policy. Makes decision whether to keep \a lock in + * LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing if young lock is found and we reach past count */ + return ((added >= count) && + time_before(cfs_time_current(), + cfs_time_add(lock->l_last_used, ns->ns_max_age))) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for default policy. Makes decision whether to keep \a lock + * in LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, + struct ldlm_lock *, int, + int, int); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) +{ + if (flags & LDLM_CANCEL_NO_WAIT) + return ldlm_cancel_no_wait_policy; + + if (ns_connect_lru_resize(ns)) { + if (flags & LDLM_CANCEL_SHRINK) + /* We kill passed number of old locks. */ + return ldlm_cancel_passed_policy; + else if (flags & LDLM_CANCEL_LRUR) + return ldlm_cancel_lrur_policy; + else if (flags & LDLM_CANCEL_PASSED) + return ldlm_cancel_passed_policy; + } else { + if (flags & LDLM_CANCEL_AGED) + return ldlm_cancel_aged_policy; + } + + return ldlm_cancel_default_policy; +} + +/** + * - Free space in LRU for \a count new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. 
As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Calling policies for enabled LRU resize: + * ---------------------------------------- + * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to + * cancel not more than \a count locks; + * + * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at + * the beginning of LRU list); + * + * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to + * memory pressure policy function; + * + * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy". + * + * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible + * (typically before replaying locks) w/o + * sending any RPCs or waiting for any + * outstanding RPC to complete. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + int flags) +{ + ldlm_cancel_lru_policy_t pf; + struct ldlm_lock *lock, *next; + int added = 0, unused, remained; + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + remained = unused; + + if (!ns_connect_lru_resize(ns)) + count += unused - ns->ns_max_unused; + + pf = ldlm_cancel_lru_policy(ns, flags); + LASSERT(pf != NULL); + + while (!list_empty(&ns->ns_unused_list)) { + ldlm_policy_res_t result; + + /* all unused locks */ + if (remained-- <= 0) + break; + + /* For any flags, stop scanning if @max is reached. */ + if (max && added >= max) + break; + + list_for_each_entry_safe(lock, next, &ns->ns_unused_list, + l_lru) { + /* No locks which got blocking requests. */ + LASSERT(!(lock->l_flags & LDLM_FL_BL_AST)); + + if (flags & LDLM_CANCEL_NO_WAIT && + lock->l_flags & LDLM_FL_SKIPPED) + /* already processed */ + continue; + + /* Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. */ + if (!(lock->l_flags & LDLM_FL_CANCELING)) + break; + + ldlm_lock_remove_from_lru_nolock(lock); + } + if (&lock->l_lru == &ns->ns_unused_list) + break; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __func__, current); + + /* Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. */ + result = pf(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + break; + } + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if ((lock->l_flags & LDLM_FL_CANCELING) || + (ldlm_lock_remove_from_lru(lock) == 0)) { + /* Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused. 
*/ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. */ + lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; + + /* Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + /* We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless bug 5666 says it is + * used only on server */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __func__, current); + spin_lock(&ns->ns_lock); + added++; + unused--; + } + spin_unlock(&ns->ns_lock); + return added; +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int count, int max, ldlm_cancel_flags_t cancel_flags, + int flags) +{ + int added; + + added = ldlm_prepare_lru_list(ns, cancels, count, max, flags); + if (added <= 0) + return added; + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a nr locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t cancel_flags, + int flags) +{ + LIST_HEAD(cancels); + int count, rc; + + /* Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. */ + count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + return count; + + return 0; +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* If somebody is already doing CANCEL, or blocking AST came, + * skip this lock. 
*/ + if (lock->l_flags & LDLM_FL_BL_AST || + lock->l_flags & LDLM_FL_CANCELING) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + !(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + return ldlm_cli_cancel_list_local(cancels, count, cancel_flags); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags) +{ + struct ldlm_lock *lock; + int res = 0; + + if (list_empty(cancels) || count == 0) + return 0; + + /* XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_resource *res; + LIST_HEAD(cancels); + int count; + int rc; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) { + /* This is not a problem. 
*/ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + return 0; + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, + lc->lc_flags, lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_cancel_flags_t flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + if (ns == NULL) + return ELDLM_OK; + + if (res_id != NULL) { + return ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg); + return ELDLM_OK; + } +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused); + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + if (!res) + return LDLM_ITER_CONTINUE; + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + + list_for_each_safe(tmp, next, &res->lr_converting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + out: + unlock_res(res); + return rc; +} +EXPORT_SYMBOL(ldlm_resource_foreach); + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { + .iter = iter, + .closure = closure, + }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper); + +} +EXPORT_SYMBOL(ldlm_namespace_foreach); + +/* non-blocking function to manipulate a lock whose 
cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. + * < 0: errors + */ +int ldlm_resource_iterate(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_iterator_t iter, void *data) +{ + struct ldlm_resource *res; + int rc; + + if (ns == NULL) { + CERROR("must pass in namespace\n"); + LBUG(); + } + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) + return 0; + + LDLM_RESOURCE_ADDREF(res); + rc = ldlm_resource_foreach(res, iter, data); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return rc; +} +EXPORT_SYMBOL(ldlm_resource_iterate); + +/* Lock replay */ + +static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) +{ + struct list_head *list = closure; + + /* we use l_pending_chain here, because it's unused on clients. */ + LASSERTF(list_empty(&lock->l_pending_chain), + "lock %p next %p prev %p\n", + lock, &lock->l_pending_chain.next, + &lock->l_pending_chain.prev); + /* bug 9573: don't replay locks left after eviction, or + * bug 17614: locks being actively cancelled. Get a reference + * on a lock so that it does not disappear under us (e.g. due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct ldlm_async_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + atomic_dec(&req->rq_import->imp_replay_inflight); + if (rc != ELDLM_OK) + goto out; + + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + rc = -ESTALE; + goto out; + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + return rc; +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + + /* Bug 11974: Do not replay a lock which is actively being canceled */ + if (lock->l_flags & LDLM_FL_CANCELING) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + return 0; + } + + /* If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) */ + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + return 0; + } + + /* + * If granted mode matches the requested mode, this lock is granted. 
+ * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + return -ENOMEM; + + /* We're part of recovery, so don't wait for it. */ + req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(flags); + + ldlm_lock2handle(lock, &body->lock_handle[0]); + if (lock->l_lvb_len > 0) + req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lock->l_lvb_len); + ptlrpc_request_set_replen(req); + /* notify the server we've replayed all requests. + * also, we mark the request to be put on a dedicated + * queue to be processed after all request replayes. + * bug 6063 */ + lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE); + + LDLM_DEBUG(lock, "replaying lock:"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->lock_handle = body->lock_handle[0]; + req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + return 0; +} + +/** + * Cancel as many unused locks as possible before replay. since we are + * in recovery, we can't wait for any outstanding RPCs to send any RPC + * to the server. + * + * Called only in recovery before replaying locks. there is no need to + * replay locks that are unused. since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time. 
+ */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + /* We don't need to care whether or not LRU resize is enabled + * because the LDLM_CANCEL_NO_WAIT policy doesn't use the + * count parameter */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_CANCEL_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + LIST_HEAD(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + return 0; + + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + if (rc) { + LDLM_LOCK_RELEASE(lock); + continue; /* or try to do the rest? */ + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + } + + atomic_dec(&imp->imp_replay_inflight); + + return rc; +} +EXPORT_SYMBOL(ldlm_replay_locks); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c new file mode 100644 index 000000000..f750d42a7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1425 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" +#include "../include/obd_class.h" +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +struct mutex ldlm_srv_namespace_lock; +LIST_HEAD(ldlm_srv_namespace_list); + +struct mutex ldlm_cli_namespace_lock; +/* Client Namespaces that have active resources in them. + * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +struct proc_dir_entry *ldlm_type_proc_dir = NULL; +static struct proc_dir_entry *ldlm_ns_proc_dir = NULL; +struct proc_dir_entry *ldlm_svc_proc_dir = NULL; + +extern unsigned int ldlm_cancel_unused_locks_before_replay; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +unsigned int ldlm_dump_granted_max = 256; + +#if defined(CONFIG_PROC_FS) +static ssize_t lprocfs_wr_dump_ns(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + return count; +} +LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns); + +LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); +LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint); + +int ldlm_proc_setup(void) +{ + int rc; + struct lprocfs_vars list[] = { + { "dump_namespaces", &ldlm_dump_ns_fops, NULL, 0222 }, + { "dump_granted_max", &ldlm_rw_uint_fops, + &ldlm_dump_granted_max }, + { "cancel_unused_locks_before_replay", &ldlm_rw_uint_fops, + &ldlm_cancel_unused_locks_before_replay }, + { NULL } }; + LASSERT(ldlm_ns_proc_dir == NULL); + + ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(ldlm_type_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_type_proc_dir); + goto err; + } + + ldlm_ns_proc_dir = lprocfs_register("namespaces", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_ns_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_ns_proc_dir); + goto err_type; + } + + ldlm_svc_proc_dir = lprocfs_register("services", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_svc_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_svc_proc_dir); + goto err_ns; + } + + rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + + return 0; + +err_ns: + lprocfs_remove(&ldlm_ns_proc_dir); +err_type: + lprocfs_remove(&ldlm_type_proc_dir); +err: + ldlm_svc_proc_dir = NULL; + ldlm_type_proc_dir = NULL; + ldlm_ns_proc_dir = NULL; + return rc; +} + +void ldlm_proc_cleanup(void) +{ + if (ldlm_svc_proc_dir) + lprocfs_remove(&ldlm_svc_proc_dir); + + if (ldlm_ns_proc_dir) + lprocfs_remove(&ldlm_ns_proc_dir); + + if (ldlm_type_proc_dir) + lprocfs_remove(&ldlm_type_proc_dir); + + ldlm_svc_proc_dir = NULL; + ldlm_type_proc_dir = NULL; + ldlm_ns_proc_dir = NULL; +} + +static int lprocfs_ns_resources_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u64 res = 0; + struct cfs_hash_bd bd; + int i; + + /* result is not strictly consistent */ + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i) + res += cfs_hash_bd_count_get(&bd); + return lprocfs_rd_u64(m, &res); +} 
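+/*
+ * Editorial note (illustrative, not part of the original patch): the
+ * seq_show handler above backs the per-namespace "resource_count" file
+ * registered later via LDLM_NS_ADD_VAR() in ldlm_namespace_proc_register().
+ * It walks every bucket of ns_rs_hash and sums the per-bucket counts, so a
+ * read returns a point-in-time estimate rather than a consistent snapshot.
+ * Assuming the usual /proc/fs/lustre root and that OBD_LDLM_DEVICENAME
+ * expands to "ldlm", the file would appear roughly at
+ *   /proc/fs/lustre/ldlm/namespaces/<namespace>/resource_count
+ * (exact procfs layout is an assumption and may differ between kernels).
+ */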
+LPROC_SEQ_FOPS_RO(lprocfs_ns_resources); + +static int lprocfs_ns_locks_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return lprocfs_rd_u64(m, &locks); +} +LPROC_SEQ_FOPS_RO(lprocfs_ns_locks); + +static int lprocfs_lru_size_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return lprocfs_rd_uint(m, nr); +} + +static ssize_t lprocfs_lru_size_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private; + char dummy[MAX_STRING_SIZE + 1]; + unsigned long tmp; + int lru_resize; + int err; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + if (strncmp(dummy, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + if (ns_connect_lru_resize(ns)) { + int canceled, unused = ns->ns_nr_unused; + + /* Try to cancel all @ns_nr_unused locks. */ + canceled = ldlm_cancel_lru(ns, unused, 0, + LDLM_CANCEL_PASSED); + if (canceled < unused) { + CDEBUG(D_DLMTRACE, + "not all requested locks are canceled, requested: %d, canceled: %d\n", + unused, + canceled); + return -EINVAL; + } + } else { + tmp = ns->ns_max_unused; + ns->ns_max_unused = 0; + ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED); + ns->ns_max_unused = tmp; + } + return count; + } + + err = kstrtoul(dummy, 10, &tmp); + if (err != 0) { + CERROR("invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED); + + /* Make sure that LRU resize was originally supported before + * turning it on here. 
*/ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + } + + return count; +} +LPROC_SEQ_FOPS(lprocfs_lru_size); + +static int lprocfs_elc_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + unsigned int supp = ns_connect_cancelset(ns); + + return lprocfs_rd_uint(m, &supp); +} + +static ssize_t lprocfs_elc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private; + unsigned int supp = -1; + int rc; + + rc = lprocfs_wr_uint(file, buffer, count, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LPROC_SEQ_FOPS(lprocfs_elc); + +void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +{ + if (ns->ns_proc_dir_entry == NULL) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + lprocfs_remove(&ns->ns_proc_dir_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +#define LDLM_NS_ADD_VAR(name, var, ops) \ + do { \ + snprintf(lock_name, MAX_STRING_SIZE, name); \ + lock_vars[0].data = var; \ + lock_vars[0].fops = ops; \ + lprocfs_add_vars(ns_pde, lock_vars, NULL); \ + } while (0) + +int ldlm_namespace_proc_register(struct ldlm_namespace *ns) +{ + struct lprocfs_vars lock_vars[2]; + char lock_name[MAX_STRING_SIZE + 1]; + struct proc_dir_entry *ns_pde; + + LASSERT(ns != NULL); + LASSERT(ns->ns_rs_hash != NULL); + + if (ns->ns_proc_dir_entry != NULL) { + ns_pde = ns->ns_proc_dir_entry; + } else { + ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); + if (ns_pde == NULL) + return -ENOMEM; + ns->ns_proc_dir_entry = ns_pde; + } + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (ns->ns_stats == NULL) + return -ENOMEM; + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + lock_name[MAX_STRING_SIZE] = '\0'; + + memset(lock_vars, 0, sizeof(lock_vars)); + lock_vars[0].name = lock_name; + + LDLM_NS_ADD_VAR("resource_count", ns, &lprocfs_ns_resources_fops); + LDLM_NS_ADD_VAR("lock_count", ns, &lprocfs_ns_locks_fops); + + if (ns_is_client(ns)) { + LDLM_NS_ADD_VAR("lock_unused_count", &ns->ns_nr_unused, + &ldlm_uint_fops); + LDLM_NS_ADD_VAR("lru_size", ns, &lprocfs_lru_size_fops); + LDLM_NS_ADD_VAR("lru_max_age", &ns->ns_max_age, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("early_lock_cancel", ns, &lprocfs_elc_fops); + } else { + LDLM_NS_ADD_VAR("ctime_age_limit", &ns->ns_ctime_age_limit, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("lock_timeouts", &ns->ns_timeouts, + &ldlm_uint_fops); + LDLM_NS_ADD_VAR("max_nolock_bytes", &ns->ns_max_nolock_size, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("contention_seconds", &ns->ns_contention_time, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("contended_locks", &ns->ns_contended_locks, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("max_parallel_ast", &ns->ns_max_parallel_ast, + &ldlm_rw_uint_fops); + } + return 0; +} +#undef MAX_STRING_SIZE +#else /* CONFIG_PROC_FS */ + +#define ldlm_namespace_proc_unregister(ns) ({; }) +#define ldlm_namespace_proc_register(ns) ({0; }) + +#endif /* CONFIG_PROC_FS */ + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, 
unsigned mask) +{ + const struct ldlm_res_id *id = key; + unsigned val = 0; + unsigned i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned ldlm_res_hop_fid_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + hash += (val >> 5) + (val << 11); + } else { + val = fid_oid(&fid); + } + hash = hash_long(hash, hs->hs_bkt_bits); + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, val % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void ldlm_res_hop_get_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* cfs_hash_for_each_nolock is the only chance we call it */ + ldlm_resource_putref_locked(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +cfs_hash_ops_t ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +cfs_hash_ops_t ldlm_ns_fid_hash_ops = { + .hs_hash = ldlm_res_hop_fid_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +struct ldlm_ns_hash_def { + ldlm_ns_type_t nsd_type; + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; + /** hash operations */ + cfs_hash_ops_t *nsd_hops; +}; + +struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = { + { + .nsd_type = LDLM_NS_TYPE_MDC, + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MDT, + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OSC, + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + 
.nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OST, + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGC, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGT, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_UNKNOWN, + }, +}; + +/** + * Create and initialize new empty namespace. + */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, + ldlm_appetite_t apt, + ldlm_ns_type_t ns_type) +{ + struct ldlm_namespace *ns = NULL; + struct ldlm_ns_bucket *nsb; + struct ldlm_ns_hash_def *nsd; + struct cfs_hash_bd bd; + int idx; + int rc; + + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + return NULL; + } + + for (idx = 0;; idx++) { + nsd = &ldlm_ns_hash_defs[idx]; + if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { + CERROR("Unknown type %d for ns %s\n", ns_type, name); + goto out_ref; + } + + if (nsd->nsd_type == ns_type) + break; + } + + OBD_ALLOC_PTR(ns); + if (!ns) + goto out_ref; + + ns->ns_rs_hash = cfs_hash_create(name, + nsd->nsd_all_bits, nsd->nsd_all_bits, + nsd->nsd_bkt_bits, sizeof(*nsb), + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + nsd->nsd_hops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (ns->ns_rs_hash == NULL) + goto out_ns; + + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { + nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + rc = ldlm_namespace_proc_register(ns); + if (rc != 0) { + CERROR("Can't initialize ns proc, rc %d\n", rc); + goto out_hash; + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("Can't initialize lock pool, rc %d\n", rc); + goto out_proc; + } + + ldlm_namespace_register(ns, client); + return ns; +out_proc: + ldlm_namespace_proc_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + ldlm_put_ref(); + return NULL; +} +EXPORT_SYMBOL(ldlm_namespace_new); + +extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. 
+ */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (lock->l_flags & LDLM_FL_CLEANED) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + lock->l_flags |= LDLM_FL_CLEANED; + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + lock->l_flags |= LDLM_FL_CBPENDING; + lock->l_flags |= LDLM_FL_FAILED; + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + lock->l_flags |= LDLM_FL_LOCAL_ONLY; + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + ldlm_resource_unlink_lock(lock); + unlock_res(res); + LDLM_DEBUG(lock, "Freeing a lock still held by a client node"); + ldlm_lock_destroy(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_converting, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES + " (%p) refcount nonzero (%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + ldlm_resource_dump(D_ERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. + */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. 
+ */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + + rc = l_wait_event(ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, &lwi); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + goto force_wait; + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + return ELDLM_NAMESPACE_EXISTS; + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + return ELDLM_OK; +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + + if (!ns) + return; + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are guaranteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } +} + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + if (!ns) + return; + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. */ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_proc_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); +} + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). 
+ * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} +EXPORT_SYMBOL(ldlm_namespace_get); + +/* This is only for callers that care about refcount */ +int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} +EXPORT_SYMBOL(ldlm_namespace_put); + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + ldlm_side_t client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + ldlm_side_t client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(void) +{ + struct ldlm_resource *res; + int idx; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_converting); + INIT_LIST_HEAD(&res->lr_waiting); + + /* Initialize interval trees for each lock mode. 
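	 * lr_itree[idx] holds the granted extent locks of mode (1 << idx),
	 * i.e. one interval tree per LCK_* mode.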
*/ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* The creator of the resource must unlock the mutex after LVB + * initialization. */ + mutex_init(&res->lr_lvb_mutex); + mutex_lock(&res->lr_lvb_mutex); + + return res; +} + +/** + * Return a reference to resource with given name, creating it if necessary. + * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or NULL + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, ldlm_type_t type, int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return NULL; + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(); + if (!res) + return NULL; + + res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + res->lr_name = *name; + res->lr_type = type; + res->lr_most_restr = LCK_NL; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + /* We have taken lr_lvb_mutex. Drop it. */ + mutex_unlock(&res->lr_lvb_mutex); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + /* We won! Let's add the resource. 
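	 * The bucket version recorded before dropping the lock lets us skip
	 * the second lookup when nothing was added to this bucket in the
	 * meantime; if the version moved, the lookup above re-checks and the
	 * loser of the race frees its copy and reuses the existing resource.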
*/ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (cfs_hash_bd_count_get(&bd) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + int rc; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CERROR("%s: lvbo_init failed for resource %#llx:%#llx: rc = %d\n", + ns->ns_obd->obd_name, name->name[0], + name->name[1], rc); + if (res->lr_lvb_data) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + mutex_unlock(&res->lr_lvb_mutex); + ldlm_resource_putref(res); + return NULL; + } + } + + /* We create resource with locked lr_lvb_mutex. */ + mutex_unlock(&res->lr_lvb_mutex); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. */ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_converting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. */ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +/* Returns 1 if the resource was freed, 0 if it remains. 
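 * Unlike ldlm_resource_putref(), the caller already holds the bucket
 * lock; it is dropped around the final free and re-taken before
 * returning (see the CFS_HASH_NO_ITEMREF note below).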
*/ +int ldlm_resource_putref_locked(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + if (atomic_dec_and_test(&res->lr_refcount)) { + struct cfs_hash_bd bd; + + cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash, + &res->lr_name, &bd); + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, + * so we should never be here while calling cfs_hash_del, + * cfs_hash_for_each_nolock is the only case we can get + * here, which is safe to release cfs_hash_bd_lock. + */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + return 1; + } + return 0; +} + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + check_res_locked(res); + + LDLM_DEBUG(lock, "About to add this lock:\n"); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + list_add_tail(&lock->l_res_link, head); +} + +/** + * Insert a lock into resource after specified lock. + * + * Obtain resource description from the lock we are inserting after. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + struct ldlm_resource *res = original->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(new, "About to insert this lock after %p:\n", original); + + if (new->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + goto out; + } + + LASSERT(list_empty(&new->l_res_link)); + + list_add(&new->l_res_link, &original->l_res_link); + out:; +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. 
+ */ +void ldlm_dump_all_namespaces(ldlm_side_t client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} +EXPORT_SYMBOL(ldlm_dump_all_namespaces); + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (time_before(cfs_time_current(), ns->ns_next_dump)) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = cfs_time_shift(10); + spin_unlock(&ns->ns_lock); +} +EXPORT_SYMBOL(ldlm_namespace_dump); + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + CLASSERT(RES_NAME_SIZE == 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, "only dump %d granted locks to avoid DDOS.\n", + granted); + break; + } + } + } + if (!list_empty(&res->lr_converting)) { + CDEBUG(level, "Converting locks:\n"); + list_for_each_entry(lock, &res->lr_converting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/Makefile b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile new file mode 100644 index 000000000..2996a48a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_LUSTRE_FS) += libcfs.o + +libcfs-linux-objs := linux-tracefile.o linux-debug.o +libcfs-linux-objs += linux-prim.o linux-cpu.o +libcfs-linux-objs += linux-tcpip.o +libcfs-linux-objs += linux-curproc.o +libcfs-linux-objs += linux-module.o +libcfs-linux-objs += linux-crypto.o +libcfs-linux-objs += linux-crypto-adler.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) + +libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \ + libcfs_string.o hash.o kernel_user_comm.o \ + prng.o workitem.o libcfs_cpu.o \ + libcfs_mem.o libcfs_lock.o + +libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) diff --git 
a/kernel/drivers/staging/lustre/lustre/libcfs/debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c new file mode 100644 index 000000000..021c92fa0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c @@ -0,0 +1,460 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = ~0; +module_param(libcfs_subsystem_debug, int, 0644); +MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); +EXPORT_SYMBOL(libcfs_subsystem_debug); + +unsigned int libcfs_debug = (D_CANTMASK | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL); +module_param(libcfs_debug, int, 0644); +MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); +EXPORT_SYMBOL(libcfs_debug); + +static unsigned int libcfs_debug_mb; +module_param(libcfs_debug_mb, uint, 0644); +MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); +EXPORT_SYMBOL(libcfs_debug_mb); + +unsigned int libcfs_printk = D_CANTMASK; +module_param(libcfs_printk, uint, 0644); +MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); +EXPORT_SYMBOL(libcfs_printk); + +unsigned int libcfs_console_ratelimit = 1; +module_param(libcfs_console_ratelimit, uint, 0644); +MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); +EXPORT_SYMBOL(libcfs_console_ratelimit); + +unsigned int libcfs_console_max_delay; +module_param(libcfs_console_max_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_max_delay); + +unsigned int libcfs_console_min_delay; +module_param(libcfs_console_min_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_min_delay); + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +module_param(libcfs_console_backoff, uint, 0644); +MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); +EXPORT_SYMBOL(libcfs_console_backoff); + +unsigned int 
libcfs_debug_binary = 1; +EXPORT_SYMBOL(libcfs_debug_binary); + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +static unsigned int portal_enter_debugger; +EXPORT_SYMBOL(portal_enter_debugger); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; +EXPORT_SYMBOL(libcfs_watchdog_ratelimit); + +unsigned int libcfs_panic_on_lbug = 1; +module_param(libcfs_panic_on_lbug, uint, 0644); +MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); +EXPORT_SYMBOL(libcfs_panic_on_lbug); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); + +static wait_queue_head_t debug_ctlwq; + +char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; + +/* We need to pass a pointer here, but elsewhere this must be a const */ +static char *libcfs_debug_file_path; +module_param(libcfs_debug_file_path, charp, 0644); +MODULE_PARM_DESC(libcfs_debug_file_path, + "Path for dumping debug logs, set 'NONE' to prevent log dumping"); + +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char * +libcfs_debug_subsys2str(int subsys) +{ + switch (1 << subsys) { + default: + return NULL; + case S_UNDEFINED: + return "undefined"; + case S_MDC: + return "mdc"; + case S_MDS: + return "mds"; + case S_OSC: + return "osc"; + case S_OST: + return "ost"; + case S_CLASS: + return "class"; + case S_LOG: + return "log"; + case S_LLITE: + return "llite"; + case S_RPC: + return "rpc"; + case S_LNET: + return "lnet"; + case S_LND: + return "lnd"; + case S_PINGER: + return "pinger"; + case S_FILTER: + return "filter"; + case S_ECHO: + return "echo"; + case S_LDLM: + return "ldlm"; + case S_LOV: + return "lov"; + case S_LQUOTA: + return "lquota"; + case S_OSD: + return "osd"; + case S_LMV: + return "lmv"; + case S_SEC: + return "sec"; + case S_GSS: + return "gss"; + case S_MGC: + return "mgc"; + case S_MGS: + return "mgs"; + case S_FID: + return "fid"; + case S_FLD: + return "fld"; + } +} + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char * +libcfs_debug_dbg2str(int debug) +{ + switch (1 << debug) { + default: + return NULL; + case D_TRACE: + return "trace"; + case D_INODE: + return "inode"; + case D_SUPER: + return "super"; + case D_EXT2: + return "ext2"; + case D_MALLOC: + return "malloc"; + case D_CACHE: + return "cache"; + case D_INFO: + return "info"; + case D_IOCTL: + return "ioctl"; + case D_NETERROR: + return "neterror"; + case D_NET: + return "net"; + case D_WARNING: + return "warning"; + case D_BUFFS: + return "buffs"; + case D_OTHER: + return "other"; + case D_DENTRY: + return "dentry"; + case D_NETTRACE: + return "nettrace"; + case D_PAGE: + return "page"; + case D_DLMTRACE: + return "dlmtrace"; + case D_ERROR: + return "error"; + case D_EMERG: + return "emerg"; + case D_HA: + return "ha"; + case D_RPCTRACE: + return "rpctrace"; + case D_VFSTRACE: + return "vfstrace"; + case D_READA: + return "reada"; + case D_MMAP: + return "mmap"; + case D_CONFIG: + return "config"; + case D_CONSOLE: + return "console"; + case D_QUOTA: + return "quota"; + case D_SEC: + return "sec"; + case D_LFSCK: + return "lfsck"; + } +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? 
libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, + 0xffffffff); +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +void libcfs_debug_dumplog_internal(void *arg) +{ + void *journal_info; + + journal_info = current->journal_info; + current->journal_info = NULL; + + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) { + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%ld.%ld", libcfs_debug_file_path_arr, + get_seconds(), (long_ptr_t)arg); + pr_alert("LustreError: dumping log to %s\n", + debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } + + current->journal_info = journal_info; +} + +static int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + wake_up(&debug_ctlwq); + return 0; +} + +void libcfs_debug_dumplog(void) +{ + wait_queue_t wait; + struct task_struct *dumper; + + /* we're being careful to ensure that the kernel thread is + * able to set our state to running as it exits before we + * get to schedule() */ + init_waitqueue_entry(&wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&debug_ctlwq, &wait); + + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current_pid(), + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + pr_err("LustreError: cannot start log dump thread: %ld\n", + PTR_ERR(dumper)); + else + schedule(); + + /* be sure to teardown if cfs_create_thread() failed */ + remove_wait_queue(&debug_ctlwq, &wait); + set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +int libcfs_debug_init(unsigned long bufsize) +{ + int rc = 0; + unsigned int max = libcfs_debug_mb; + + init_waitqueue_head(&debug_ctlwq); + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + if (libcfs_debug_file_path != NULL) { + strncpy(libcfs_debug_file_path_arr, + libcfs_debug_file_path, PATH_MAX-1); + libcfs_debug_file_path_arr[PATH_MAX - 1] = '\0'; + } + + 
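	/*
	 * Worked example for the sizing below (assuming 4 KB pages, i.e.
	 * PAGE_CACHE_SHIFT == 12): libcfs_debug_mb = 256 on an 8-CPU machine
	 * gives 256 / 8 = 32 MB per CPU, i.e. 32 << (20 - 12) = 8192 trace
	 * pages per CPU passed to cfs_tracefile_init().
	 */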
/* If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = max / num_possible_cpus(); + max <<= (20 - PAGE_CACHE_SHIFT); + } + rc = cfs_tracefile_init(max); + + if (rc == 0) + libcfs_register_panic_notifier(); + + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + cfs_tracefile_exit(); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET + * should not be be marked as such. */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE, + "***************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, + "***************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET + +void libcfs_debug_set_level(unsigned int debug_level) +{ + pr_warn("Lustre: Setting portals debug level to %08x\n", + debug_level); + libcfs_debug = debug_level; +} + +EXPORT_SYMBOL(libcfs_debug_set_level); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/fail.c b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c new file mode 100644 index 000000000..92444b0fe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c @@ -0,0 +1,138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
+ */ + +#include "../../include/linux/libcfs/libcfs.h" + +unsigned long cfs_fail_loc = 0; +EXPORT_SYMBOL(cfs_fail_loc); + +unsigned int cfs_fail_val = 0; +EXPORT_SYMBOL(cfs_fail_val); + +wait_queue_head_t cfs_race_waitq; +EXPORT_SYMBOL(cfs_race_waitq); + +int cfs_race_state; +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(__u32 id, __u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... */ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) && + (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + int ret = 0; + + ret = __cfs_fail_check_set(id, value, set); + if (ret) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", + id, ms); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ms) / 1000); + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/hash.c b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c new file mode 100644 index 000000000..a55567e0d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c @@ -0,0 +1,2098 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. + * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(instead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can specify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. 
+ * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct cfs_wi_sched *cfs_sched_rehash; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +/** No lock hash */ +static cfs_hash_lock_ops_t cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static cfs_hash_lock_ops_t cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else 
if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hh_head; /**< entries list */ +} cfs_hash_head_t; + +static int +cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_head_t); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0]; + + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +} cfs_hash_head_dep_t; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_head_dep_t); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_head_dep_t *head; + + head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +} cfs_hash_dhead_t; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_dhead_t); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_dhead_t *head; + + head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? 
NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +} cfs_hash_dhead_dep_t; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_dhead_dep_t); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_dhead_dep_t *head; + + head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static cfs_hash_hlist_ops_t cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} +EXPORT_SYMBOL(cfs_hash_bd_move_locked); + +enum { + /** always set, for sanity (avoid ZERO intent) */ + 
CFS_HS_LOOKUP_MASK_FIND = 1 << 0, + /** return entry with a ref */ + CFS_HS_LOOKUP_MASK_REF = 1 << 1, + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = 1 << 2, + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = 1 << 3, +}; + +typedef enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +} cfs_hash_lookup_intent_t; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + cfs_hash_lookup_intent_t intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_ADD | + (!noref * CFS_HS_LOOKUP_MASK_REF)); +} +EXPORT_SYMBOL(cfs_hash_bd_findadd_locked); + +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode) +{ + /* hnode can be NULL, we find the first item with @key */ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); +} +EXPORT_SYMBOL(cfs_hash_bd_finddel_locked); + +static void +cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. 
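	 * For example, if the bds resolve to buckets {3, 3, 7}, only buckets
	 * 3 and 7 are locked, in ascending hsb_index order, so two threads
	 * locking overlapping bucket sets cannot deadlock.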
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { /* swab bd1 and bd2 */ + struct cfs_hash_bd tmp; + + tmp = *bd2; + *bd2 = *bd1; + *bd1 = tmp; + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_get); + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lock); + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_unlock); + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return 
cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked); + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked); + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked); + +static void +cfs_hash_buckets_free(struct cfs_hash_bucket **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static struct cfs_hash_bucket ** +cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + struct cfs_hash_bucket **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + struct cfs_hash_bd bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static int cfs_hash_rehash_worker(cfs_workitem_t *wi); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static int cfs_hash_dep_print(cfs_workitem_t *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void 
cfs_hash_depth_wi_init(struct cfs_hash *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) +{ + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi)) + return; + + spin_lock(&hs->hs_dep_lock); + while (hs->hs_dep_bits != 0) { + spin_unlock(&hs->hs_dep_lock); + cond_resched(); + spin_lock(&hs->hs_dep_lock); + } + spin_unlock(&hs->hs_dep_lock); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + CLASSERT(CFS_HASH_THETA_BITS < 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + return NULL; + + strncpy(hs->hs_name, name, len); + hs->hs_name[len - 1] = '\0'; + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + return NULL; +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
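+ * (Destruction is only reached from cfs_hash_putref() once the last
+ * reference is dropped, which is why no further locking is needed here.)
+ *
+ * Rough lifecycle sketch, illustrative only and not part of the original
+ * driver code; the ops table "my_hash_ops" and the bit/theta arguments are
+ * placeholders, and the CFS_HASH_* constants are assumed to come from
+ * libcfs_hash.h:
+ *
+ *     struct cfs_hash *hs;
+ *
+ *     hs = cfs_hash_create("my_hash", 5, 10, 3, 0,
+ *                          CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ *                          &my_hash_ops, CFS_HASH_DEFAULT);
+ *     if (hs == NULL)
+ *             return -ENOMEM;
+ *     ... add, lookup and delete items ...
+ *     cfs_hash_putref(hs);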
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
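+ *
+ * Usage sketch (illustrative only; "struct my_obj" and its fields are
+ * placeholders for an object with an embedded, unhashed hlist_node):
+ *
+ *     struct my_obj {
+ *             __u64                   mo_key;
+ *             struct hlist_node       mo_hnode;
+ *     };
+ *
+ *     INIT_HLIST_NODE(&obj->mo_hnode);
+ *     cfs_hash_add(hs, &obj->mo_key, &obj->mo_hnode);
+ *
+ * For collision-aware insertion see cfs_hash_add_unique() below, which
+ * returns -EALREADY instead of adding a duplicate key.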
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
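+ *
+ * Usage sketch (illustrative; "struct my_obj" is a placeholder and the
+ * returned pointer is whatever ops->hs_object maps the removed node to):
+ *
+ *     struct my_obj *obj;
+ *
+ *     obj = cfs_hash_del_key(hs, &key);
+ *     if (obj == NULL)
+ *             ... no item with this key was found ...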
+ */ +void * +cfs_hash_del_key(struct cfs_hash *hs, const void *key) +{ + return cfs_hash_del(hs, key, NULL); +} +EXPORT_SYMBOL(cfs_hash_del_key); + +/** + * Lookup an item using @key in the libcfs hash @hs and return it. + * If the @key is found in the hash hs->hs_get() is called and the + * matching objects is returned. It is the callers responsibility + * to call the counterpart ops->hs_put using the cfs_hash_put() macro + * when when finished with the object. If the @key was not found + * in the hash @hs NULL is returned. + */ +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key) +{ + void *obj = NULL; + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key); + if (hnode != NULL) + obj = cfs_hash_object(hs, hnode); + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_lookup); + +static void +cfs_hash_for_each_enter(struct cfs_hash *hs) { + LASSERT(!cfs_hash_is_exiting(hs)); + + if (!cfs_hash_with_rehash(hs)) + return; + /* + * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter + * because it's just an unreliable signal to rehash-thread, + * rehash-thread will try to finish rehash ASAP when seeing this. + */ + hs->hs_iterating = 1; + + cfs_hash_lock(hs, 1); + hs->hs_iterators++; + + /* NB: iteration is mostly called by service thread, + * we tend to cancel pending rehash-request, instead of + * blocking service thread, we will relaunch rehash request + * after iteration */ + if (cfs_hash_is_rehashing(hs)) + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +static void +cfs_hash_for_each_exit(struct cfs_hash *hs) { + int remained; + int bits; + + if (!cfs_hash_with_rehash(hs)) + return; + cfs_hash_lock(hs, 1); + remained = --hs->hs_iterators; + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 1); + /* NB: it's race on cfs_has_t::hs_iterating, see above */ + if (remained == 0) + hs->hs_iterating = 0; + if (bits > 0) { + cfs_hash_rehash(hs, atomic_read(&hs->hs_count) < + CFS_HASH_LOOP_HOG); + } +} + +/** + * For each item in the libcfs hash @hs call the passed callback @func + * and pass to it as an argument each hash item and the private @data. + * + * a) the function may sleep! + * b) during the callback: + * . the bucket lock is held so the callback must never sleep. + * . 
if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) { + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + return count; +} + +typedef struct { + cfs_hash_cond_opt_cb_t func; + void *arg; +} cfs_hash_cond_arg_t; + +static int +cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + cfs_hash_cond_arg_t *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + cfs_hash_cond_arg_t arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(struct cfs_hash *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(struct cfs_hash *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. 
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data) { + struct hlist_node *hnode; + struct hlist_node *tmp; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int rc; + int i; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs) || + hs->hs_ops->hs_put_locked == NULL; + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + for (hnode = hhead->first; hnode != NULL;) { + cfs_hash_bucket_validate(hs, &bd, hnode); + cfs_hash_get(hs, hnode); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change) + cfs_hash_put(hs, hnode); + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (!stop_on_change) { + tmp = hnode->next; + cfs_hash_put_locked(hs, hnode); + hnode = tmp; + } else { /* bucket changed? */ + if (version != + cfs_hash_bd_version_get(&bd)) + break; + /* safe to continue because no change */ + hnode = hnode->next; + } + if (rc) /* callback wants to break iteration */ + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + } + cfs_hash_unlock(hs, 0); + + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data); + cfs_hash_for_each_exit(hs); + + return 0; +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
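+ *
+ * Drain sketch (illustrative; "my_drain_cb" and "struct my_obj" are
+ * placeholders, and the callback removes each item as required):
+ *
+ *     static int my_drain_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *                            struct hlist_node *hnode, void *data)
+ *     {
+ *             struct my_obj *obj = cfs_hash_object(hs, hnode);
+ *
+ *             cfs_hash_del(hs, &obj->mo_key, hnode);
+ *             return 0;
+ *     }
+ *
+ *     cfs_hash_for_each_empty(hs, my_drain_cb, NULL);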
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + unsigned i = 0; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + return 0; +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); + out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) { + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). + */ +void +cfs_hash_rehash_cancel_locked(struct cfs_hash *hs) +{ + int i; + + /* need hold cfs_hash_lock(hs, 1) */ + LASSERT(cfs_hash_with_rehash(hs) && + !cfs_hash_with_no_lock(hs)); + + if (!cfs_hash_is_rehashing(hs)) + return; + + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) { + hs->hs_rehash_bits = 0; + return; + } + + for (i = 2; cfs_hash_is_rehashing(hs); i++) { + cfs_hash_unlock(hs, 1); + /* raise console warning while waiting too long */ + CDEBUG(IS_PO2(i >> 3) ? 
D_WARNING : D_INFO, + "hash %s is still rehashing, rescheded %d\n", + hs->hs_name, i - 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked); + +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + cfs_hash_lock(hs, 1); + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel); + +int +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return rc; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi); + cfs_hash_unlock(hs, 1); + return 0; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + return cfs_hash_rehash_worker(&hs->hs_rehash_wi); +} +EXPORT_SYMBOL(cfs_hash_rehash); + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + + return c; +} + +static int +cfs_hash_rehash_worker(cfs_workitem_t *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_rehash_wi); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT (hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; + out: + hs->hs_rehash_bits = 0; + if (rc == -ESRCH) /* never be scheduled again */ + cfs_wi_exit(cfs_sched_rehash, wi); + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); + /* return 1 only if cfs_wi_exit is called */ + return rc == -ESRCH; +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
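+ *
+ * Sketch (illustrative; the object layout is a placeholder, and a hash
+ * created with CFS_HASH_REHASH_KEY plus an ops->hs_keycpy method is
+ * assumed so the key stored in the object can be updated in place):
+ *
+ *     old_key = obj->mo_key;
+ *     new_key = my_compute_key(obj);
+ *     cfs_hash_rehash_key(hs, &old_key, &new_key, &obj->mo_hnode);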
+ */ +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[3]; + struct cfs_hash_bd old_bds[2]; + struct cfs_hash_bd new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, new_key, hnode); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +void cfs_hash_debug_header(struct seq_file *m) +{ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n", + CFS_HASH_BIGNAME_LEN, "name"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static struct cfs_hash_bucket ** +cfs_hash_full_bkts(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int theta; + int i; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ", + CFS_HASH_BIGNAME_LEN, hs->hs_name, + 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits, + 1 << hs->hs_max_bits, + __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta), + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta), + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta), + hs->hs_flags, hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. 
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c new file mode 100644 index 000000000..d9b7c6b69 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_KUC D_OTHER + +#include "../../include/linux/libcfs/libcfs.h" + +/* This is the kernel side (liblustre as well). */ + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. 
First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + mm_segment_t fs; + int rc = -ENOSYS; + + if (filp == NULL || IS_ERR(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + fs = get_fs(); + set_fs(KERNEL_DS); + while (count > 0) { + rc = vfs_write(filp, (void __force __user *)payload, + count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + set_fs(fs); + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + int kr_uid; + struct file *kr_fp; + __u32 kr_data; +}; +static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {}; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + */ +int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data) +{ + struct kkuc_reg *reg; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kmalloc(sizeof(*reg), 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_fp = filp; + reg->kr_uid = uid; + reg->kr_data = data; + + down_write(&kg_sem); + if (kkuc_groups[group].next == NULL) + INIT_LIST_HEAD(&kkuc_groups[group]); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(int uid, int group) +{ + struct kkuc_reg *reg, *next; + + if (kkuc_groups[group].next == NULL) + return 0; + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if ((uid == 0) || (uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + kfree(reg); + } + } + up_write(&kg_sem); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_read(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + 
return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg iextra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* no link for this group */ + if (kkuc_groups[group].next == NULL) + return 0; + + down_write(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_write(&kg_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c new file mode 100644 index 000000000..31a558115 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c @@ -0,0 +1,224 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_table __read_mostly; +EXPORT_SYMBOL(cfs_cpt_table); + +#ifndef HAVE_LIBCFS_CPT + +#define CFS_CPU_VERSION_MAGIC 0xbabecafe + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab != NULL) { + cptab->ctb_version = CFS_CPU_VERSION_MAGIC; + cptab->ctb_nparts = ncpt; + } + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +#ifdef CONFIG_SMP +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc = 0; + + rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); +#endif /* CONFIG_SMP */ + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_online); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpu_ht_nsiblings(int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpu_ht_nsiblings); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) { + cfs_cpt_table_free(cfs_cpt_table); + cfs_cpt_table = NULL; + } +} + +int +cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table != NULL ? 
0 : -1; +} + +#endif /* HAVE_LIBCFS_CPT */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c new file mode 100644 index 000000000..2c199c725 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. 
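+ *
+ * Usage sketch (illustrative; "cpt" stands for a CPU partition id the
+ * caller already holds, and cfs_cpt_table is the global partition table):
+ *
+ *     struct cfs_percpt_lock *pcl;
+ *
+ *     pcl = cfs_percpt_lock_alloc(cfs_cpt_table);
+ *     if (pcl == NULL)
+ *             return -ENOMEM;
+ *
+ *     cfs_percpt_lock(pcl, cpt);
+ *     ... touch data private to partition "cpt" ...
+ *     cfs_percpt_unlock(pcl, cpt);
+ *
+ *     cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
+ *     ... exclusive access across all partitions ...
+ *     cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
+ *
+ *     cfs_percpt_lock_free(pcl);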
+ */ +struct cfs_percpt_lock * +cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) + spin_lock_init(lock); + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_alloc); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); + + +/** free cpu-partition refcount */ +void +cfs_percpt_atomic_free(atomic_t **refs) +{ + cfs_percpt_free(refs); +} +EXPORT_SYMBOL(cfs_percpt_atomic_free); + +/** allocate cpu-partition refcount with initial value @init_val */ +atomic_t ** +cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val) +{ + atomic_t **refs; + atomic_t *ref; + int i; + + refs = cfs_percpt_alloc(cptab, sizeof(*ref)); + if (refs == NULL) + return NULL; + + cfs_percpt_for_each(ref, i, refs) + atomic_set(ref, init_val); + return refs; +} +EXPORT_SYMBOL(cfs_percpt_atomic_alloc); + +/** return sum of cpu-partition refs */ +int +cfs_percpt_atomic_summary(atomic_t **refs) +{ + atomic_t *ref; + int i; + int val = 0; + + cfs_percpt_for_each(ref, i, refs) + val += atomic_read(ref); + + return val; +} +EXPORT_SYMBOL(cfs_percpt_atomic_summary); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c new file mode 100644 index 000000000..1debdda72 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. 
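+ *
+ * A slightly fuller sketch with initialization and teardown (illustrative;
+ * "struct my_counter" and its field are placeholders):
+ *
+ *     struct my_counter **cnts;
+ *     struct my_counter *c;
+ *     int i;
+ *
+ *     cnts = cfs_percpt_alloc(cfs_cpt_table, sizeof(**cnts));
+ *     if (cnts == NULL)
+ *             return -ENOMEM;
+ *     cfs_percpt_for_each(c, i, cnts)
+ *             c->mc_value = 0;
+ *     ...
+ *     cfs_percpt_free(cnts);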
+ */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + +/* + * return memory block shadowed from current CPU + */ +void * +cfs_percpt_current(void *vars) +{ + struct cfs_var_array *arr; + int cpt; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + cpt = cfs_cpt_current(arr->va_cptab, 0); + if (cpt < 0) + return NULL; + + return arr->va_ptrs[cpt]; +} +EXPORT_SYMBOL(cfs_percpt_current); + +void * +cfs_percpt_index(void *vars, int idx) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + LASSERT(idx >= 0 && idx < arr->va_count); + return arr->va_ptrs[idx]; +} +EXPORT_SYMBOL(cfs_percpt_index); + +/* + * free variable array, see more detail in cfs_array_alloc + */ +void +cfs_array_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] == NULL) + continue; + + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_array_free); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by @count, @size is size of each + * memory block in array. + */ +void * +cfs_array_alloc(int count, unsigned int size) +{ + struct cfs_var_array *arr; + int i; + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_count = count; + arr->va_size = size; + + for (i = 0; i < count; i++) { + LIBCFS_ALLOC(arr->va_ptrs[i], size); + + if (arr->va_ptrs[i] == NULL) { + cfs_array_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_array_alloc); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c new file mode 100644 index 000000000..76d4392bd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c @@ -0,0 +1,562 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include "../../include/linux/libcfs/libcfs.h" + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask) +{ + const char *debugstr; + char op = '\0'; + int newmask = minmask, i, len, found = 0; + + /* must be a list of tokens separated by whitespace + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). An operator + * applies to all following tokens up to the next operator. */ + while (*str != '\0') { + while (isspace(*str)) + str++; + if (*str == '\0') + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == '\0') /* trailing op */ + return -EINVAL; + } + + /* find token length */ + len = 0; + while (str[len] != '\0' && !isspace(str[len]) && + str[len] != '+' && str[len] != '-') + len++; + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~(1 << i); + else + newmask |= (1 << i); + found = 1; + break; + } + } + if (!found && len == 3 && + (strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} + +/* get the first string out of @str */ +char *cfs_firststr(char *str, size_t size) +{ + size_t i = 0; + char *end; + + /* trim leading spaces */ + while (i < size && *str && isspace(*str)) { + ++i; + ++str; + } + + /* string with all spaces */ + if (*str == '\0') + goto out; + + end = str; + while (i < size && *end != '\0' && !isspace(*end)) { + ++i; + ++end; + } + + *end = '\0'; +out: + return str; +} +EXPORT_SYMBOL(cfs_firststr); + +char * +cfs_trimwhite(char *str) +{ + char *end; + + while (isspace(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!isspace(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} +EXPORT_SYMBOL(cfs_trimwhite); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. 
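+ *
+ * For example (sketch), walking a comma-separated list held in "buf":
+ *
+ *     struct cfs_lstr next = { .ls_str = buf, .ls_len = strlen(buf) };
+ *     struct cfs_lstr tok;
+ *
+ *     while (cfs_gettok(&next, ',', &tok))
+ *             ... tok.ls_str/tok.ls_len hold one whitespace-trimmed token ...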
+ * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + str = cfs_trimwhite(str); + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. 
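+ *
+ * For example (sketch), a list parsed from "[2-10/2]" with
+ * cfs_expr_list_parse() below expands to 2, 4, 6, 8 and 10, so 6 matches
+ * while 7 does not:
+ *
+ *     struct cfs_expr_list *el;
+ *
+ *     if (cfs_expr_list_parse("[2-10/2]", strlen("[2-10/2]"),
+ *                             0, 255, &el) == 0) {
+ *             matched = cfs_expr_list_match(6, el);
+ *             cfs_expr_list_free(el);
+ *     }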
+ * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + LIBCFS_ALLOC(val, sizeof(val[0]) * count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link), + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a str parses to \ | \ + * \retval 0 otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. 
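/*
 * Editor's illustrative sketch, not part of the original patch: parsing a
 * bracketed range expression and testing values against it with the
 * cfs_expr_list_*() helpers defined above.  demo_expr_list() is a
 * hypothetical caller.
 */
static void demo_expr_list(void)
{
        char expr[] = "[0-7/2,9]";
        struct cfs_expr_list *el;

        if (cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 31, &el) != 0)
                return;

        /* matches 0, 2, 4, 6 (stride 2) and 9; everything else fails */
        LASSERT(cfs_expr_list_match(4, el));
        LASSERT(!cfs_expr_list_match(5, el));

        cfs_expr_list_free(el);
}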
+ * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; + out: + cfs_expr_list_free_list(list); + + return rc; +} +EXPORT_SYMBOL(cfs_ip_addr_parse); + +/** + * Matches address (\a addr) against address set encoded in \a list. + * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} +EXPORT_SYMBOL(cfs_ip_addr_match); + +void +cfs_ip_addr_free(struct list_head *list) +{ + cfs_expr_list_free_list(list); +} +EXPORT_SYMBOL(cfs_ip_addr_free); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c new file mode 100644 index 000000000..cc3ab3519 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c @@ -0,0 +1,1056 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
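/*
 * Editor's illustrative sketch, not part of the original patch: matching a
 * host-order IPv4 address against a pattern with the cfs_ip_addr_*()
 * helpers that close libcfs_string.c above.  demo_ip_match() and the
 * pattern below are hypothetical.
 */
static void demo_ip_match(void)
{
        char pat[] = "192.168.0.[1-100]";
        LIST_HEAD(addr_exprs);
        __u32 addr = (192 << 24) | (168 << 16) | (0 << 8) | 42;

        if (cfs_ip_addr_parse(pat, sizeof(pat) - 1, &addr_exprs) != 0)
                return;

        /* 192.168.0.42 falls inside [1-100], so this matches */
        if (cfs_ip_addr_match(addr, &addr_exprs))
                CDEBUG(D_NET, "address matches %s\n", pat);

        cfs_ip_addr_free(&addr_exprs);
}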
+ * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "../../../include/linux/libcfs/libcfs.h" + +#ifdef CONFIG_SMP + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = ""; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +struct cfs_cpt_data { + /* serialize hotplug etc */ + spinlock_t cpt_lock; + /* reserved for hotplug */ + unsigned long cpt_version; + /* mutex to protect cpt_cpumask */ + struct mutex cpt_mutex; + /* scratch buffer for set/unset_node */ + cpumask_t *cpt_cpumask; +}; + +static struct cfs_cpt_data cpt_data; + +static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask) +{ + /* return cpumask of cores in the same socket */ + cpumask_copy(mask, topology_core_cpumask(cpu)); +} + +/* return cpumask of HTs in the same core */ +static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) +{ + cpumask_copy(mask, topology_thread_cpumask(cpu)); +} + +static void cfs_node_to_cpumask(int node, cpumask_t *mask) +{ + cpumask_copy(mask, cpumask_of_node(node)); +} + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + num_possible_cpus() * + sizeof(cptab->ctb_cpu2cpt[0])); + } + + for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask != NULL) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask != NULL) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + } + + if (cptab->ctb_parts != NULL) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask != NULL) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask != NULL) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab == NULL) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + + if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) + goto failed; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + if (cptab->ctb_cpu2cpt == NULL) + goto failed; + + memset(cptab->ctb_cpu2cpt, -1, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (cptab->ctb_parts == NULL) + goto failed; + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition 
*part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL) + goto failed; + } + + spin_lock(&cpt_data.cpt_lock); + /* Reserved for hotplug */ + cptab->ctb_version = cpt_data.cpt_version; + spin_unlock(&cpt_data.cpt_lock); + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc = 0; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len > 0) { + rc = snprintf(tmp, len, "%d\t: ", i); + len -= rc; + } + + if (len <= 0) { + rc = -EFBIG; + goto out; + } + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, "%d ", j); + len -= rc; + if (len <= 0) { + rc = -EFBIG; + goto out; + } + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + out: + if (rc < 0) + return rc; + + return tmp - buf; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t * +cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? 
+ cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cptab->ctb_cpu2cpt[cpu] = cpt; + + LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + + node = cpu_to_node(cpu); + + /* first CPU of @node in this CPT table */ + if (!node_isset(node, *cptab->ctb_nodemask)) + node_set(node, *cptab->ctb_nodemask); + + /* first CPU of @node in this partition */ + if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) + node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in cpu-partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + cptab->ctb_cpu2cpt[cpu] = -1; + + node = cpu_to_node(cpu); + + LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); + LASSERT(node_isset(node, *cptab->ctb_nodemask)); + + for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(i) == node) + break; + } + + if (i >= nr_cpu_ids) + node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + for_each_cpu(i, cptab->ctb_cpumask) { + /* this CPT-table has other CPU belonging to this node? 
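/*
 * Editor's illustrative sketch, not part of the original patch: building a
 * small CPU partition table by hand with the helpers above and dumping it.
 * Assumes CPU 0 is online; demo_cpt_build() is a hypothetical caller.
 */
static void demo_cpt_build(void)
{
        struct cfs_cpt_table *cptab;
        char buf[128];
        int rc;

        cptab = cfs_cpt_table_alloc(2);
        if (cptab == NULL)
                return;

        /* place CPU 0 into partition 0; returns 1 on success */
        if (!cfs_cpt_set_cpu(cptab, 0, 0)) {
                cfs_cpt_table_free(cptab);
                return;
        }

        LASSERT(cfs_cpt_number(cptab) == 2);
        LASSERT(cfs_cpt_weight(cptab, 0) == 1);

        /* cfs_cpt_table_print() does not NUL-terminate, so print by length */
        rc = cfs_cpt_table_print(cptab, buf, sizeof(buf));
        if (rc > 0)
                CDEBUG(D_INFO, "cpt table:\n%.*s", rc, buf);

        cfs_cpt_table_free(cptab);
}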
*/ + if (cpu_to_node(i) == node) + break; + } + + if (i >= nr_cpu_ids) + node_clear(node, *cptab->ctb_nodemask); + + return; +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + if (cpumask_weight(mask) == 0 || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(i, mask) { + if (!cfs_cpt_set_cpu(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + for_each_cpu(i, mask) + cfs_cpt_unset_cpu(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + int rc; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mutex_lock(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + rc = cfs_cpt_set_cpumask(cptab, cpt, mask); + + mutex_unlock(&cpt_data.cpt_mutex); + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mutex_lock(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + cfs_cpt_unset_cpumask(cptab, cpt, mask); + + mutex_unlock(&cpt_data.cpt_mutex); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) { + if (!cfs_cpt_set_node(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) + cfs_cpt_unset_node(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ + int last; + int i; + + if (cpt == CFS_CPT_ANY) { + last = cptab->ctb_nparts - 1; + cpt = 0; + } else { + last = cpt; + } + + for (; cpt <= last; cpt++) { + for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) + cfs_cpt_unset_cpu(cptab, cpt, i); + } +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + int rotor; + int node; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + } + + weight = nodes_weight(*mask); + LASSERT(weight > 0); + + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } + + LBUG(); + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu = smp_processor_id(); + int cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0) { + if (!remap) + return cpt; + + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID */ + cpt = cpu % cptab->ctb_nparts; + } + + return cpt; +} 
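/*
 * Editor's illustrative sketch, not part of the original patch: a
 * NUMA-aware consumer typically fills a partition from a node and then
 * uses cfs_cpt_current()/cfs_cpt_spread_node() to decide where to run or
 * where to allocate.  Assumes NUMA node 0 has online CPUs;
 * demo_cpt_numa() is a hypothetical caller.
 */
static void demo_cpt_numa(void)
{
        struct cfs_cpt_table *cptab;
        int cpt;
        int node;

        cptab = cfs_cpt_table_alloc(1);
        if (cptab == NULL)
                return;

        /* pull all CPUs of NUMA node 0 into partition 0 */
        if (!cfs_cpt_set_node(cptab, 0, 0)) {
                cfs_cpt_table_free(cptab);
                return;
        }

        preempt_disable();
        cpt = cfs_cpt_current(cptab, 1);        /* partition of local CPU */
        preempt_enable();

        node = cfs_cpt_spread_node(cptab, cpt); /* node to allocate from */
        CDEBUG(D_INFO, "running in CPT %d, spreading to node %d\n",
               cpt, node);

        cfs_cpt_table_free(cptab);
}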
+EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + cpumask_t *cpumask; + nodemask_t *nodemask; + int rc; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) { + CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -EINVAL; + } + + for_each_online_cpu(i) { + if (cpumask_test_cpu(i, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (rc == 0) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. + */ +static int +cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node, int number) +{ + cpumask_t *socket = NULL; + cpumask_t *core = NULL; + int rc = 0; + int cpu; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node)) { + while (!cpumask_empty(node)) { + cpu = cpumask_first(node); + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + cpumask_clear_cpu(cpu, node); + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket, cpumask_size()); + LIBCFS_ALLOC(core, cpumask_size()); + if (socket == NULL || core == NULL) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node)) { + cpu = cpumask_first(node); + + /* get cpumask for cores in the same socket */ + cfs_cpu_core_siblings(cpu, socket); + cpumask_and(socket, socket, node); + + LASSERT(!cpumask_empty(socket)); + + while (!cpumask_empty(socket)) { + int i; + + /* get cpumask for hts in the same core */ + cfs_cpu_ht_siblings(cpu, core); + cpumask_and(core, core, node); + + LASSERT(!cpumask_empty(core)); + + for_each_cpu(i, core) { + cpumask_clear_cpu(i, socket); + cpumask_clear_cpu(i, node); + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (--number == 0) + goto out; + } + cpu = cpumask_first(socket); + } + } + + out: + if (socket != NULL) + LIBCFS_FREE(socket, cpumask_size()); + if (core != NULL) + LIBCFS_FREE(core, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int +cfs_cpt_num_estimate(void) +{ + unsigned nnode = num_online_nodes(); + unsigned ncpu = num_online_cpus(); + unsigned ncpt; + + if (ncpu <= CPT_WEIGHT_MIN) { + ncpt = 1; + goto out; + } + + /* generate reasonable number of CPU partitions based on total number + * of CPUs, Preferred N should be power2 and match this condition: + * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */ + for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {} + + if (ncpt <= nnode) { /* fat numa system */ + while (nnode > ncpt) + nnode >>= 1; + + } else { /* ncpt > nnode */ + while ((nnode << 1) <= ncpt) + nnode <<= 1; + } + + ncpt = nnode; + + out: +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit 
system could consume + * too much memory */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt != 0) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *mask = NULL; + int cpt = 0; + int num; + int rc; + int i; + + rc = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = rc; + + if (ncpt > num_online_cpus() || ncpt > 4 * rc) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, rc); + } + + if (num_online_cpus() % ncpt != 0) { + CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n", + (int)num_online_cpus(), ncpt); + goto failed; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + goto failed; + } + + num = num_online_cpus() / ncpt; + if (num == 0) { + CERROR("CPU changed while setting CPU partition\n"); + goto failed; + } + + LIBCFS_ALLOC(mask, cpumask_size()); + if (mask == NULL) { + CERROR("Failed to allocate scratch cpumask\n"); + goto failed; + } + + for_each_online_node(i) { + cfs_node_to_cpumask(i, mask); + + while (!cpumask_empty(mask)) { + struct cfs_cpu_partition *part; + int n; + + if (cpt >= ncpt) + goto failed; + + part = &cptab->ctb_parts[cpt]; + + n = num - cpumask_weight(part->cpt_cpumask); + LASSERT(n > 0); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); + if (rc < 0) + goto failed; + + LASSERT(num >= cpumask_weight(part->cpt_cpumask)); + if (num == cpumask_weight(part->cpt_cpumask)) + cpt++; + } + } + + if (cpt != ncpt || + num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { + CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n", + cptab->ctb_nparts, num, cpt, + cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)); + goto failed; + } + + LIBCFS_FREE(mask, cpumask_size()); + + return cptab; + + failed: + CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", + ncpt, num_online_nodes(), num_online_cpus()); + + if (mask != NULL) + LIBCFS_FREE(mask, cpumask_size()); + + if (cptab != NULL) + cfs_cpt_table_free(cptab); + + return NULL; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create_pattern(char *pattern) +{ + struct cfs_cpt_table *cptab; + char *str = pattern; + int node = 0; + int high; + int ncpt; + int c; + + for (ncpt = 0;; ncpt++) { /* quick scan bracket */ + str = strchr(str, '['); + if (str == NULL) + break; + str++; + } + + str = cfs_trimwhite(pattern); + if (*str == 'n' || *str == 'N') { + pattern = str + 1; + node = 1; + } + + if (ncpt == 0 || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern %s, or too many partitions %d\n", + pattern, ncpt); + return NULL; + } + + high = node ? 
MAX_NUMNODES - 1 : nr_cpu_ids - 1; + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate cpu partition table\n"); + return NULL; + } + + for (str = cfs_trimwhite(pattern), c = 0;; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + char *bracket = strchr(str, '['); + int cpt; + int rc; + int i; + int n; + + if (bracket == NULL) { + if (*str != 0) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } else if (c != ncpt) { + CERROR("expect %d partitions but found %d\n", + ncpt, c); + goto failed; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid cpu pattern %s\n", str); + goto failed; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + goto failed; + } + + if (cfs_cpt_weight(cptab, cpt) != 0) { + CERROR("Partition %d has already been set.\n", cpt); + goto failed; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } + + bracket = strchr(str, ']'); + if (bracket == NULL) { + CERROR("missing right bracket for cpt %d, %s\n", + cpt, str); + goto failed; + } + + if (cfs_expr_list_parse(str, (bracket - str) + 1, + 0, high, &el) != 0) { + CERROR("Can't parse number range: %s\n", str); + goto failed; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride != 0) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) : + cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + goto failed; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + goto failed; + } + + str = cfs_trimwhite(bracket + 1); + } + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int +cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + bool warn; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&cpt_data.cpt_lock); + cpt_data.cpt_version++; + spin_unlock(&cpt_data.cpt_lock); + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + mutex_lock(&cpt_data.cpt_mutex); + /* if all HTs in a core are offline, it may break affinity */ + cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask); + warn = cpumask_any_and(cpt_data.cpt_cpumask, + cpu_online_mask) >= nr_cpu_ids; + mutex_unlock(&cpt_data.cpt_mutex); + CDEBUG(warn ? 
D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n", + cpu, action); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; + +#endif + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif + if (cpt_data.cpt_cpumask != NULL) + LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size()); +} + +int +cfs_cpu_init(void) +{ + LASSERT(cfs_cpt_table == NULL); + + memset(&cpt_data, 0, sizeof(cpt_data)); + + LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size()); + if (cpt_data.cpt_cpumask == NULL) { + CERROR("Failed to allocate scratch buffer\n"); + return -1; + } + + spin_lock_init(&cpt_data.cpt_lock); + mutex_init(&cpt_data.cpt_mutex); + +#ifdef CONFIG_HOTPLUG_CPU + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif + + if (*cpu_pattern != 0) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create cptab from pattern %s\n", + cpu_pattern); + goto failed; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create ptable with npartitions %d\n", + cpu_npartitions); + goto failed; + } + } + + spin_lock(&cpt_data.cpt_lock); + if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) { + spin_unlock(&cpt_data.cpt_lock); + CERROR("CPU hotplug/unplug during setup\n"); + goto failed; + } + spin_unlock(&cpt_data.cpt_lock); + + LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n", + num_online_cpus(), cfs_cpt_number(cfs_cpt_table)); + return 0; + + failed: + cfs_cpu_fini(); + return -1; +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c new file mode 100644 index 000000000..5e185fa59 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c @@ -0,0 +1,141 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. 
+ */ + +#include +#include +#include +#include "linux-crypto.h" + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static u32 __adler32(u32 cksum, unsigned char const *p, size_t len) +{ + return zlib_adler32(cksum, p, len); +} + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = __adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = __adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = adler32_cra_init, + } +}; + + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c new file mode 100644 index 000000000..aa3fffed1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c @@ -0,0 +1,291 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. 
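/*
 * Editor's illustrative sketch, not part of the original patch: once the
 * algorithm above is registered, any kernel user can reach it through the
 * generic shash API under the name "adler32" (driver "adler32-zlib").
 * demo_adler32() is a hypothetical caller.
 */
#include <crypto/hash.h>
#include <linux/slab.h>

static int demo_adler32(const void *data, unsigned int len, u32 *out)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int rc;

        tfm = crypto_alloc_shash("adler32", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
                       GFP_KERNEL);
        if (desc == NULL) {
                crypto_free_shash(tfm);
                return -ENOMEM;
        }

        desc->tfm = tfm;
        desc->flags = 0;

        /* one-shot digest; the 4-byte result lands in *out */
        rc = crypto_shash_digest(desc, data, len, (u8 *)out);

        kfree(desc);
        crypto_free_shash(tfm);
        return rc;
}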
+ * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, Intel Corporation. + */ + +#include +#include +#include "../../../include/linux/libcfs/libcfs.h" +#include "linux-crypto.h" +/** + * Array of hash algorithm speed in MByte per second + */ +static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + + + +static int cfs_crypto_hash_alloc(unsigned char alg_id, + const struct cfs_crypto_hash_type **type, + struct hash_desc *desc, unsigned char *key, + unsigned int key_len) +{ + int err = 0; + + *type = cfs_crypto_hash_type(alg_id); + + if (*type == NULL) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + alg_id, CFS_HASH_ALG_MAX); + return -EINVAL; + } + desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0); + + if (desc->tfm == NULL) + return -EINVAL; + + if (IS_ERR(desc->tfm)) { + CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", + (*type)->cht_name); + return PTR_ERR(desc->tfm); + } + + desc->flags = 0; + + /** Shash have different logic for initialization then digest + * shash: crypto_hash_setkey, crypto_hash_init + * digest: crypto_digest_init, crypto_digest_setkey + * Skip this function for digest, because we use shash logic at + * cfs_crypto_hash_alloc. + */ + if (key != NULL) { + err = crypto_hash_setkey(desc->tfm, key, key_len); + } else if ((*type)->cht_key != 0) { + err = crypto_hash_setkey(desc->tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + } + + if (err != 0) { + crypto_free_hash(desc->tfm); + return err; + } + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name, + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name, + cfs_crypto_hash_speeds[alg_id]); + + return crypto_hash_init(desc); +} + +int cfs_crypto_hash_digest(unsigned char alg_id, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct hash_desc hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + if (buf == NULL || buf_len == 0 || hash_len == NULL) + return -EINVAL; + + err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len); + if (err != 0) + return err; + + if (hash == NULL || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_hash(hdesc.tfm); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + hdesc.flags = 0; + err = crypto_hash_digest(&hdesc, &sl, sl.length, hash); + crypto_free_hash(hdesc.tfm); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +struct cfs_crypto_hash_desc * + cfs_crypto_hash_init(unsigned char alg_id, + unsigned char *key, unsigned int key_len) +{ + + struct hash_desc *hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + hdesc = kmalloc(sizeof(*hdesc), 0); + if (hdesc == NULL) + return ERR_PTR(-ENOMEM); + + err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len); + + if (err) { + kfree(hdesc); + return ERR_PTR(err); + } + return (struct cfs_crypto_hash_desc *)hdesc; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, + const 
void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/* If hash_len pointer is NULL - destroy descriptor. */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, + unsigned char *hash, unsigned int *hash_len) +{ + int err; + int size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm); + + if (hash_len == NULL) { + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return 0; + } + if (hash == NULL || *hash_len < size) { + *hash_len = size; + return -ENOSPC; + } + err = crypto_hash_final((struct hash_desc *) hdesc, hash); + + if (err < 0) { + /* May be caller can fix error */ + return err; + } + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +static void cfs_crypto_performance_test(unsigned char alg_id, + const unsigned char *buf, + unsigned int buf_len) +{ + unsigned long start, end; + int bcount, err = 0; + int sec = 1; /* do test only 1 sec */ + unsigned char hash[64]; + unsigned int hash_len = 64; + + for (start = jiffies, end = start + sec * HZ, bcount = 0; + time_before(jiffies, end); bcount++) { + err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0, + hash, &hash_len); + if (err) + break; + + } + end = jiffies; + + if (err) { + cfs_crypto_hash_speeds[alg_id] = -1; + CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n", + cfs_crypto_hash_name(alg_id), err); + } else { + unsigned long tmp; + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[alg_id] = (int)tmp; + } + CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]); +} + +int cfs_crypto_hash_speed(unsigned char hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) + return cfs_crypto_hash_speeds[hash_alg]; + else + return -1; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Do performance test for all hash algorithms. + */ +static int cfs_crypto_test_hashes(void) +{ + unsigned char i; + unsigned char *data; + unsigned int j; + /* Data block size for testing hash. Maximum + * kmalloc size for 2.6.18 kernel is 128K */ + unsigned int data_len = 1 * 128 * 1024; + + data = kmalloc(data_len, 0); + if (data == NULL) { + CERROR("Failed to allocate mem\n"); + return -ENOMEM; + } + + for (j = 0; j < data_len; j++) + data[j] = j & 0xff; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + cfs_crypto_performance_test(i, data, data_len); + + kfree(data); + return 0; +} + +static int adler32; + +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + adler32 = cfs_crypto_adler32_register(); + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + return 0; +} +void cfs_crypto_unregister(void) +{ + if (adler32 == 0) + cfs_crypto_adler32_unregister(); + + return; +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h new file mode 100644 index 000000000..18e8cd4d8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h @@ -0,0 +1,29 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
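/*
 * Editor's illustrative sketch, not part of the original patch: one-shot
 * and streaming use of the wrappers above.  CFS_HASH_ALG_ADLER32 is
 * assumed to be one of the algorithm ids declared in libcfs_crypto.h;
 * demo_cfs_hash() is a hypothetical caller.
 */
static int demo_cfs_hash(const void *buf, unsigned int len)
{
        struct cfs_crypto_hash_desc *desc;
        unsigned char digest[4];
        unsigned int digest_len = sizeof(digest);
        int rc;

        /* one-shot: allocate, hash and free in a single call */
        rc = cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, len,
                                    NULL, 0, digest, &digest_len);
        if (rc != 0)
                return rc;

        /* streaming: init / update / final */
        desc = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
        if (IS_ERR(desc))
                return PTR_ERR(desc);

        rc = cfs_crypto_hash_update(desc, buf, len);
        if (rc == 0)
                rc = cfs_crypto_hash_final(desc, digest, &digest_len);
        else
                cfs_crypto_hash_final(desc, NULL, NULL); /* just free it */

        return rc;
}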
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c new file mode 100644 index 000000000..277f6b890 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c @@ -0,0 +1,111 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-curproc.c + * + * Lustre curproc API implementation for Linux kernel + * + * Author: Nikita Danilov + */ + +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for Linux kernel. 
+ */ + +void cfs_cap_raise(cfs_cap_t cap) +{ + struct cred *cred; + + cred = prepare_creds(); + if (cred) { + cap_raise(cred->cap_effective, cap); + commit_creds(cred); + } +} + +void cfs_cap_lower(cfs_cap_t cap) +{ + struct cred *cred; + + cred = prepare_creds(); + if (cred) { + cap_lower(cred->cap_effective, cap); + commit_creds(cred); + } +} + +int cfs_cap_raised(cfs_cap_t cap) +{ + return cap_raised(current_cap(), cap); +} + +static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap) +{ + /* XXX lost high byte */ + *cap = kcap.cap[0]; +} + +cfs_cap_t cfs_curproc_cap_pack(void) +{ + cfs_cap_t cap; + cfs_kernel_cap_pack(current_cap(), &cap); + return cap; +} + +EXPORT_SYMBOL(cfs_cap_raise); +EXPORT_SYMBOL(cfs_cap_lower); +EXPORT_SYMBOL(cfs_cap_raised); +EXPORT_SYMBOL(cfs_curproc_cap_pack); + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c new file mode 100644 index 000000000..4545d54f7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c @@ -0,0 +1,200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-debug.c + * + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#include "../tracefile.h" + +#include + +char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall"; +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. 
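/*
 * Editor's illustrative sketch, not part of the original patch:
 * temporarily granting the current task an effective capability around a
 * privileged operation, using the helpers above.  demo_caps() is a
 * hypothetical caller.
 */
static void demo_caps(void)
{
        cfs_cap_t saved = cfs_curproc_cap_pack();

        if (!cfs_cap_raised(CAP_SYS_RESOURCE))
                cfs_cap_raise(CAP_SYS_RESOURCE);

        /* ... privileged work ... */

        /* drop the capability again only if we did not have it before */
        if (!(saved & (1 << CAP_SYS_RESOURCE)))
                cfs_cap_lower(CAP_SYS_RESOURCE);
}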
+ * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; /* only need to pass the path of the file */ + + argv[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } +} + +void libcfs_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + argv[0] = lnet_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; check /proc/sys/lnet/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata) +{ + char *argv[6]; + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", msgdata->msg_line); + + argv[1] = "LBUG"; + argv[2] = (char *)msgdata->msg_file; + argv[3] = (char *)msgdata->msg_fn; + argv[4] = buf; + argv[5] = NULL; + + libcfs_run_upcall (argv); +} + +/* coverity[+kill] */ +void lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + dump_stack(); + if (!libcfs_panic_on_lbug) + libcfs_debug_dumplog(); + libcfs_run_lbug_upcall(msgdata); + if (libcfs_panic_on_lbug) + panic("LBUG"); + set_task_state(current, TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000, +}; + +void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +} + +void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +} + +EXPORT_SYMBOL(libcfs_run_upcall); +EXPORT_SYMBOL(libcfs_run_lbug_upcall); +EXPORT_SYMBOL(lbug_with_loc); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c new file mode 100644 index 000000000..e962f8968 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c @@ -0,0 +1,183 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
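/*
 * Editor's illustrative sketch, not part of the original patch: callers of
 * libcfs_run_upcall() leave argv[0] free (it is overwritten with
 * lnet_upcall) and terminate the vector with NULL, exactly as
 * libcfs_run_lbug_upcall() does above.  demo_run_upcall() and its
 * arguments are hypothetical.
 */
static void demo_run_upcall(void)
{
        char *argv[4];

        argv[1] = "TEST";               /* upcall reason */
        argv[2] = "extra-argument";
        argv[3] = NULL;

        libcfs_run_upcall(argv);        /* argv[0] is filled in by the callee */
}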
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#define LNET_MINOR 240 + +int libcfs_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct libcfs_ioctl_hdr *hdr; + struct libcfs_ioctl_data *data; + int orig_len; + + hdr = (struct libcfs_ioctl_hdr *)buf; + data = (struct libcfs_ioctl_data *)buf; + + if (copy_from_user(buf, (void *)arg, sizeof(*hdr))) + return -EFAULT; + + if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) { + CERROR("PORTALS: version mismatch kernel vs application\n"); + return -EINVAL; + } + + if (hdr->ioc_len >= end - buf) { + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + return -EINVAL; + } + + + if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) { + CERROR("PORTALS: user buffer too small for ioctl\n"); + return -EINVAL; + } + + orig_len = hdr->ioc_len; + if (copy_from_user(buf, (void *)arg, hdr->ioc_len)) + return -EFAULT; + if (orig_len != data->ioc_len) + return -EINVAL; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("PORTALS: ioctl not correctly formatted\n"); + return -EINVAL; + } + + if (data->ioc_inllen1) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + return 0; +} + +int libcfs_ioctl_popdata(void *arg, void *data, int size) +{ + if (copy_to_user((char *)arg, data, size)) + return -EFAULT; + return 0; +} + +extern struct cfs_psdev_ops libcfs_psdev_ops; + +static int +libcfs_psdev_open(struct inode *inode, struct file *file) +{ + struct libcfs_device_userstate **pdu = NULL; + int rc = 0; + + if (!inode) + return -EINVAL; + pdu = (struct libcfs_device_userstate **)&file->private_data; + if (libcfs_psdev_ops.p_open != NULL) + rc = libcfs_psdev_ops.p_open(0, (void *)pdu); + else + return -EPERM; + return rc; +} + +/* called when closing /dev/device */ +static int +libcfs_psdev_release(struct inode *inode, struct file *file) +{ + struct libcfs_device_userstate *pdu; + int rc = 0; + + if (!inode) + return -EINVAL; + pdu = file->private_data; + if (libcfs_psdev_ops.p_close != NULL) + rc = libcfs_psdev_ops.p_close(0, (void *)pdu); + else + rc = -EPERM; + return rc; +} + +static long libcfs_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct cfs_psdev_file pfile; + int rc = 0; + + if 
(!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + /* Handle platform-dependent IOC requests */ + switch (cmd) { + case IOC_LIBCFS_PANIC: + if (!capable(CFS_CAP_SYS_BOOT)) + return -EPERM; + panic("debugctl-invoked panic"); + return 0; + case IOC_LIBCFS_MEMHOG: + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + /* go thought */ + } + + pfile.off = 0; + pfile.private_data = file->private_data; + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + else + rc = -EPERM; + return rc; +} + +static const struct file_operations libcfs_fops = { + .unlocked_ioctl = libcfs_ioctl, + .open = libcfs_psdev_open, + .release = libcfs_psdev_release, +}; + +struct miscdevice libcfs_dev = { + .minor = LNET_MINOR, + .name = "lnet", + .fops = &libcfs_fops, +}; diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c new file mode 100644 index 000000000..838f5f3bd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" + +#if defined(CONFIG_KGDB) +#include +#endif + +/** + * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * waiting threads, which is not always desirable because all threads will + * be waken up again and again, even user only needs a few of them to be + * active most time. This is not good for performance because cache can + * be polluted by different threads. + * + * LIFO list can resolve this problem because we always wakeup the most + * recent active thread by default. + * + * NB: please don't call non-exclusive & exclusive wait on the same + * waitq if add_wait_queue_exclusive_head is used. 
+ */ +void +add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&waitq->lock, flags); + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&waitq->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_head); + +void cfs_init_timer(struct timer_list *t) +{ + init_timer(t); +} +EXPORT_SYMBOL(cfs_init_timer); + +void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg) +{ + init_timer(t); + t->function = func; + t->data = (unsigned long)arg; +} +EXPORT_SYMBOL(cfs_timer_init); + +void cfs_timer_done(struct timer_list *t) +{ + return; +} +EXPORT_SYMBOL(cfs_timer_done); + +void cfs_timer_arm(struct timer_list *t, unsigned long deadline) +{ + mod_timer(t, deadline); +} +EXPORT_SYMBOL(cfs_timer_arm); + +void cfs_timer_disarm(struct timer_list *t) +{ + del_timer(t); +} +EXPORT_SYMBOL(cfs_timer_disarm); + +int cfs_timer_is_armed(struct timer_list *t) +{ + return timer_pending(t); +} +EXPORT_SYMBOL(cfs_timer_is_armed); + +unsigned long cfs_timer_deadline(struct timer_list *t) +{ + return t->expires; +} +EXPORT_SYMBOL(cfs_timer_deadline); + +void cfs_enter_debugger(void) +{ +#if defined(CONFIG_KGDB) + /* BREAKPOINT(); */ +#else + /* nothing */ +#endif +} +EXPORT_SYMBOL(cfs_enter_debugger); + + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return old; +} +EXPORT_SYMBOL(cfs_block_allsigs); + +sigset_t cfs_block_sigs(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigs); + +/* Block all signals except for the @sigs */ +sigset_t cfs_block_sigsinv(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, ~sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return old; +} +EXPORT_SYMBOL(cfs_block_sigsinv); + +void +cfs_restore_sigs(sigset_t old) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_restore_sigs); + +int +cfs_signal_pending(void) +{ + return signal_pending(current); +} +EXPORT_SYMBOL(cfs_signal_pending); + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + clear_tsk_thread_flag(current, TIF_SIGPENDING); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_clear_sigpending); + +int +libcfs_arch_init(void) +{ + return 0; +} +EXPORT_SYMBOL(libcfs_arch_init); + +void +libcfs_arch_cleanup(void) +{ + return; +} +EXPORT_SYMBOL(libcfs_arch_cleanup); + diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c new file mode 100644 index 000000000..f2462e7f0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c @@ -0,0 +1,623 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#include +#include +#include +/* For sys_open & sys_close */ +#include + +static int +libcfs_sock_ioctl(int cmd, unsigned long arg) +{ + mm_segment_t oldmm = get_fs(); + struct socket *sock; + int rc; + struct file *sock_filp; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + sock_filp = sock_alloc_file(sock, 0, NULL); + if (IS_ERR(sock_filp)) { + sock_release(sock); + rc = PTR_ERR(sock_filp); + goto out; + } + + set_fs(KERNEL_DS); + if (sock_filp->f_op->unlocked_ioctl) + rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg); + set_fs(oldmm); + + fput(sock_filp); +out: + return rc; +} + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + int nob; + int rc; + __u32 val; + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + return -EINVAL; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + + strcpy(ifr.ifr_name, name); + rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + return rc; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + return 0; + } + + *up = 1; + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + return 0; +} + +EXPORT_SYMBOL(libcfs_ipif_query); + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; 
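+	/*
+	 * Strategy of the loop below: start with room for 16 struct ifreq,
+	 * issue SIOCGIFCONF, and double the buffer until the kernel returns
+	 * fewer entries than were allocated.  The buffer is capped at one
+	 * page, so only the first page worth of interfaces is enumerated
+	 * when a host has more than that.
+	 */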
+ + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) { + toobig = 1; + nalloc = PAGE_CACHE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT (rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); + out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_enumerate); + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} + +EXPORT_SYMBOL(libcfs_ipif_free_enumeration); + +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket send timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + ticks -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (ticks <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_sock_write); + +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + LASSERT (ticks > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + ticks -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (ticks <= 0) + return -ETIMEDOUT; + } +} + +EXPORT_SYMBOL(libcfs_sock_read); + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + option = 1; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+ INADDR_ANY : htonl(local_ip); + + rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + + return 0; + + failed: + sock_release(sock); + return rc; +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return rc; + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return rc; + } + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_setbuf); + +int +libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; + + rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len, + remote ? 2 : 0); + if (rc != 0) { + CERROR ("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl (sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs (sin.sin_port); + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getaddr); + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + + if (txbufsize != NULL) { + *txbufsize = sock->sk->sk_sndbuf; + } + + if (rxbufsize != NULL) { + *rxbufsize = sock->sk->sk_rcvbuf; + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getbuf); + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = (*sockp)->ops->listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_listen); + +int +libcfs_sock_accept (struct socket **newsockp, struct socket *sock) +{ + wait_queue_t wait; + struct socket *newsock; + int rc; + + init_waitqueue_entry(&wait, current); + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sock->sk), &wait); + schedule(); + remove_wait_queue(sk_sleep(sock->sk), &wait); + set_current_state(TASK_RUNNING); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + } + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + + failed: + sock_release(newsock); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_accept); + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wake_up_all(sk_sleep(sock->sk)); +} + +EXPORT_SYMBOL(libcfs_sock_abort_accept); + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + 
__u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = (*sockp)->ops->connect(*sockp, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, + &local_ip, local_port, &peer_ip, peer_port); + + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_connect); + +void +libcfs_sock_release (struct socket *sock) +{ + sock_release(sock); +} + +EXPORT_SYMBOL(libcfs_sock_release); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c new file mode 100644 index 000000000..c8e293002 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../tracefile.h" + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; + +struct rw_semaphore cfs_tracefile_sem; + +int cfs_tracefile_init_arch(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + init_rwsem(&cfs_tracefile_sem); + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { + cfs_trace_data[i] = + kmalloc(sizeof(union cfs_trace_data_union) * + num_possible_cpus(), GFP_KERNEL); + if (cfs_trace_data[i] == NULL) + goto out; + + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = pages_factor[i]; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + } + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) { + cfs_trace_console_buffers[i][j] = + kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (cfs_trace_console_buffers[i][j] == NULL) + goto out; + } + + return 0; + +out: + cfs_tracefile_fini_arch(); + printk(KERN_ERR "lnet: Not enough memory\n"); + return -ENOMEM; +} + +void cfs_tracefile_fini_arch(void) +{ + int i; + int j; + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) + if (cfs_trace_console_buffers[i][j] != NULL) { + kfree(cfs_trace_console_buffers[i][j]); + cfs_trace_console_buffers[i][j] = NULL; + } + + for (i = 0; cfs_trace_data[i] != NULL; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_read_lock(void) +{ + down_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_unlock(void) +{ + up_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_lock(void) +{ + down_write(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_unlock(void) +{ + up_write(&cfs_tracefile_sem); +} + +cfs_trace_buf_type_t cfs_trace_buf_idx_get(void) +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + else if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + else + return CFS_TCD_TYPE_PROC; +} + +/* + * The walking argument indicates the locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. 
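+ *
+ * In practice (see the lock/unlock pair below): CFS_TCD_TYPE_IRQ buffers
+ * take the lock with spin_lock_irqsave(), CFS_TCD_TYPE_SOFTIRQ buffers
+ * with spin_lock_bh(), and CFS_TCD_TYPE_PROC buffers with a plain
+ * spin_lock() unless the caller is walking all types, in which case
+ * spin_lock_irq() is used.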
+ */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __acquires(&tcd->tc_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __releases(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; +} + +void +cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timeval tv; + + do_gettimeofday(&tv); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + header->ph_sec = (__u32)tv.tv_sec; + header->ph_usec = tv.tv_usec; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; + return; +} + +static char * +dbghdr_to_err_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNetError"; + default: + return "LustreError"; + } +} + +static char * +dbghdr_to_info_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNet"; + default: + return "Lustre"; + } +} + +void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_WARNING; + } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, + fn, len, buf); + } + return; +} + +int cfs_trace_max_debug_mb(void) +{ + int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + + return max(512, (total_mb * 80)/100); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h new file mode 100644 index 000000000..ba84e4ffd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_TRACEFILE_H__ +#define __LIBCFS_LINUX_TRACEFILE_H__ + +/** + * three types of trace_data in linux + */ +typedef enum { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +} cfs_trace_buf_type_t; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/module.c b/kernel/drivers/staging/lustre/lustre/libcfs/module.c new file mode 100644 index 000000000..f0ee76abf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/module.c @@ -0,0 +1,976 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../../include/linux/libcfs/libcfs_crypto.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lnet.h" +#include "tracefile.h" + +MODULE_AUTHOR("Peter J. 
Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +extern struct miscdevice libcfs_dev; +extern struct rw_semaphore cfs_tracefile_sem; +extern struct mutex cfs_trace_thread_mutex; +extern struct cfs_wi_sched *cfs_sched_rehash; +extern void libcfs_init_nidstrings(void); + +static int insert_proc(void); +static void remove_proc(void); + +static struct ctl_table_header *lnet_table_header; +extern char lnet_upcall[1024]; +/** + * The path of debug log dump upcall script. + */ +extern char lnet_debug_log_upcall[1024]; + +#define CTL_LNET (0x100) + +enum { + PSDEV_DEBUG = 1, /* control debugging */ + PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ + PSDEV_PRINTK, /* force all messages to console */ + PSDEV_CONSOLE_RATELIMIT, /* ratelimit console messages */ + PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */ + PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */ + PSDEV_CONSOLE_BACKOFF, /* delay increase factor */ + PSDEV_DEBUG_PATH, /* crashdump log location */ + PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_CPT_TABLE, /* information about cpu partitions */ + PSDEV_LNET_UPCALL, /* User mode upcall script */ + PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */ + PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */ + PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */ + PSDEV_LNET_DUMP_KERNEL, /* snapshot kernel debug buffer to file */ + PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */ + PSDEV_LNET_DEBUG_MB, /* size of debug buffer */ + PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */ + PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */ + PSDEV_LNET_FORCE_LBUG, /* hook to force an LBUG */ + PSDEV_LNET_FAIL_LOC, /* control test failures instrumentation */ + PSDEV_LNET_FAIL_VAL, /* userdata for fail loc */ +}; + +static void kportal_memhog_free (struct libcfs_device_userstate *ldu) +{ + struct page **level0p = &ldu->ldu_memhog_root_page; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + if (*level0p != NULL) { + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + + while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level1p != NULL) { + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + + while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level2p != NULL) { + + __free_page(*level2p); + ldu->ldu_memhog_pages--; + level2p++; + count2++; + } + + __free_page(*level1p); + ldu->ldu_memhog_pages--; + level1p++; + count1++; + } + + __free_page(*level0p); + ldu->ldu_memhog_pages--; + + *level0p = NULL; + } + + LASSERT (ldu->ldu_memhog_pages == 0); +} + +static int kportal_memhog_alloc(struct libcfs_device_userstate *ldu, int npages, + gfp_t flags) +{ + struct page **level0p; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + LASSERT (ldu->ldu_memhog_pages == 0); + LASSERT (ldu->ldu_memhog_root_page == NULL); + + if (npages < 0) + return -EINVAL; + + if (npages == 0) + return 0; + + level0p = &ldu->ldu_memhog_root_page; + *level0p = alloc_page(flags); + if (*level0p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + memset(level1p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return -EINTR; + + *level1p = alloc_page(flags); + if (*level1p == NULL) + return -ENOMEM; + 
ldu->ldu_memhog_pages++; + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + memset(level2p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return -EINTR; + + *level2p = alloc_page(flags); + if (*level2p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level2p++; + count2++; + } + + level1p++; + count1++; + } + + return 0; +} + +/* called when opening /dev/device */ +static int libcfs_psdev_open(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + + try_module_get(THIS_MODULE); + + LIBCFS_ALLOC(ldu, sizeof(*ldu)); + if (ldu != NULL) { + ldu->ldu_memhog_pages = 0; + ldu->ldu_memhog_root_page = NULL; + } + *(struct libcfs_device_userstate **)args = ldu; + + return 0; +} + +/* called when closing /dev/device */ +static int libcfs_psdev_release(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + + ldu = (struct libcfs_device_userstate *)args; + if (ldu != NULL) { + kportal_memhog_free(ldu); + LIBCFS_FREE(ldu, sizeof(*ldu)); + } + + module_put(THIS_MODULE); + return 0; +} + +static struct rw_semaphore ioctl_list_sem; +static struct list_head ioctl_list; + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + else + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_register_ioctl); + +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + else + list_del_init(&hand->item); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_deregister_ioctl); + +static int libcfs_ioctl_int(struct cfs_psdev_file *pfile, unsigned long cmd, + void *arg, struct libcfs_ioctl_data *data) +{ + int err = -EINVAL; + + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + return 0; + /* + * case IOC_LIBCFS_PANIC: + * Handled in arch/cfs_module.c + */ + case IOC_LIBCFS_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + return -EINVAL; + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + return 0; + case IOC_LIBCFS_MEMHOG: + if (pfile->private_data == NULL) { + err = -EINVAL; + } else { + kportal_memhog_free(pfile->private_data); + /* XXX The ioc_flags is not GFP flags now, need to be fixed */ + err = kportal_memhog_alloc(pfile->private_data, + data->ioc_count, + data->ioc_flags); + if (err != 0) + kportal_memhog_free(pfile->private_data); + } + break; + + case IOC_LIBCFS_PING_TEST: { + extern void (kping_client)(struct libcfs_ioctl_data *); + void (*ping)(struct libcfs_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n", + data->ioc_count, libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(data->ioc_nid)); + ping = symbol_get(kping_client); + if (!ping) + CERROR("symbol_get failed\n"); + else { + ping(data); + symbol_put(kping_client); + } + return 0; + } + + default: { + struct libcfs_ioctl_handler *hand; + err = -EINVAL; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(cmd, data); + if (err != -EINVAL) { + if (err == 0) + err = libcfs_ioctl_popdata(arg, + data, sizeof (*data)); + break; + } + } + up_read(&ioctl_list_sem); + break; + } + } + + return err; +} + +static int libcfs_ioctl(struct cfs_psdev_file *pfile, 
unsigned long cmd, void *arg) +{ + char *buf; + struct libcfs_ioctl_data *data; + int err = 0; + + LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS); + if (buf == NULL) + return -ENOMEM; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + err = -EINVAL; + goto out; + } + data = (struct libcfs_ioctl_data *)buf; + + err = libcfs_ioctl_int(pfile, cmd, arg, data); + +out: + LIBCFS_FREE(buf, 1024); + return err; +} + + +struct cfs_psdev_ops libcfs_psdev_ops = { + libcfs_psdev_open, + libcfs_psdev_release, + NULL, + NULL, + libcfs_ioctl +}; + +static int init_libcfs_module(void) +{ + int rc; + + libcfs_arch_init(); + libcfs_init_nidstrings(); + init_rwsem(&cfs_tracefile_sem); + mutex_init(&cfs_trace_thread_mutex); + init_rwsem(&ioctl_list_sem); + INIT_LIST_HEAD(&ioctl_list); + init_waitqueue_head(&cfs_race_waitq); + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + pr_err("LustreError: libcfs_debug_init: %d\n", rc); + return rc; + } + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_cpu; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + /* max to 4 threads, should be enough for rehash */ + rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4); + rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY, + rc, &cfs_sched_rehash); + if (rc != 0) { + CERROR("Startup workitem scheduler: error: %d\n", rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_register: error %d\n", rc); + goto cleanup_wi; + } + + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_crypto; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return 0; + cleanup_crypto: + cfs_crypto_unregister(); + cleanup_wi: + cfs_wi_shutdown(); + cleanup_deregister: + misc_deregister(&libcfs_dev); +cleanup_cpu: + cfs_cpu_fini(); + cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void exit_libcfs_module(void) +{ + int rc; + + remove_proc(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (cfs_sched_rehash != NULL) { + cfs_wi_sched_destroy(cfs_sched_rehash); + cfs_sched_rehash = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + rc = misc_deregister(&libcfs_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + cfs_cpu_fini(); + + if (atomic_read(&libcfs_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&libcfs_kmemory)); + + rc = libcfs_debug_cleanup(); + if (rc) + pr_err("LustreError: libcfs_debug_cleanup: %d\n", rc); + + libcfs_arch_cleanup(); +} + +static int proc_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} + +static int __proc_dobitmasks(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int tmpstrlen = 512; + char *tmpstr; + int rc; + unsigned int *mask = data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 
1 : 0; + + rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); + if (rc < 0) + return rc; + + if (!write) { + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); + if (rc < 0) { + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; + } + + rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; +} + +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dobitmasks); +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int __proc_dump_kernel(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dump_kernel); +} + +static int __proc_daemon_file(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int proc_daemon_file(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_daemon_file); +} + +static int __proc_debug_mb(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + char tmpstr[32]; + int len = snprintf(tmpstr, sizeof(tmpstr), "%d", + cfs_trace_get_debug_mb()); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, + "\n"); + } + + return cfs_trace_set_debug_mb_usrstr(buffer, nob); +} + +static int proc_debug_mb(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_debug_mb); +} + +static int proc_console_max_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int rc, max_delay_cs; + struct ctl_table dummy = *table; + long d; + + dummy.data = &max_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + max_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (max_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(max_delay_cs) / 100; + if (d == 0 || d < libcfs_console_min_delay) + return -EINVAL; + libcfs_console_max_delay = d; + + return rc; +} + +static int proc_console_min_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int rc, min_delay_cs; + struct ctl_table dummy = *table; + 
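+	/*
+	 * Same pattern as proc_console_max_delay_cs() above: expose the
+	 * value to userspace in centiseconds through a scratch ctl_table
+	 * entry handled by proc_dointvec(), then validate the written value
+	 * and convert it before storing it in libcfs_console_min_delay.
+	 */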
long d; + + dummy.data = &min_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + min_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (min_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(min_delay_cs) / 100; + if (d == 0 || d > libcfs_console_max_delay) + return -EINVAL; + libcfs_console_min_delay = d; + + return rc; +} + +static int proc_console_backoff(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc, backoff; + struct ctl_table dummy = *table; + + dummy.data = &backoff; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + backoff = libcfs_console_backoff; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + backoff = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (backoff <= 0) + return -EINVAL; + + libcfs_console_backoff = backoff; + + return rc; +} + +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (old_fail_loc != cfs_fail_loc) + wake_up(&cfs_race_waitq); + return rc; +} + +static int __proc_cpt_table(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_table); +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. 
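+ *
+ * These entries are registered below under an "lnet" parent directory,
+ * so they appear as /proc/sys/lnet/<name> (e.g. /proc/sys/lnet/debug
+ * and /proc/sys/lnet/debug_mb).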
+ */ + { + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "console_ratelimit", + .data = &libcfs_console_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "console_max_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_max_delay_cs + }, + { + .procname = "console_min_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_min_delay_cs + }, + { + .procname = "console_backoff", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_backoff + }, + + { + .procname = "debug_path", + .data = libcfs_debug_file_path_arr, + .maxlen = sizeof(libcfs_debug_file_path_arr), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + + { + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + + { + .procname = "upcall", + .data = lnet_upcall, + .maxlen = sizeof(lnet_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + .procname = "lnet_memused", + .data = (int *)&libcfs_kmemory.counter, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .procname = "panic_on_lbug", + .data = &libcfs_panic_on_lbug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + .procname = "daemon_file", + .mode = 0644, + .maxlen = 256, + .proc_handler = &proc_daemon_file, + }, + { + .procname = "debug_mb", + .mode = 0644, + .proc_handler = &proc_debug_mb, + }, + { + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + } +}; + +static struct ctl_table top_table[] = { + { + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + } +}; + +static int insert_proc(void) +{ + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); + return 0; +} + +static void remove_proc(void) +{ + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +} + +MODULE_VERSION("1.0.0"); + +module_init(init_libcfs_module); +module_exit(exit_libcfs_module); diff --git 
a/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c new file mode 100644 index 000000000..087449f4e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c @@ -0,0 +1,842 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +static spinlock_t libcfs_nidstring_lock; + +void libcfs_init_nidstrings(void) +{ + spin_lock_init(&libcfs_nidstring_lock); +} + +static char * +libcfs_next_nidstring(void) +{ + char *str; + unsigned long flags; + + spin_lock_irqsave(&libcfs_nidstring_lock, flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings)) + libcfs_nidstring_idx = 0; + + spin_unlock_irqrestore(&libcfs_nidstring_lock, flags); + return str; +} + +static int libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void libcfs_ip_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +static int libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? 
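+ * -- e.g. "10.1.2.3".  Note the trailing %n together with the n == nob
+ * check below: n is pre-initialised to nob, so the parse is accepted only
+ * when sscanf consumed the whole string (i.e. no trailing junk).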
*/ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + + return 0; +} + +static void libcfs_decnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u", addr); +} + +static void libcfs_hexnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr); +} + +static int libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +struct netstrfns { + int nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); +}; + +static struct netstrfns libcfs_netstrfns[] = { + {/* .nf_type */ LOLND, + /* .nf_name */ "lo", + /* .nf_modname */ "klolnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_lo_str2addr, + /* .nf_parse_addr*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ SOCKLND, + /* .nf_name */ "tcp", + /* .nf_modname */ "ksocklnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ O2IBLND, + /* .nf_name */ "o2ib", + /* .nf_modname */ "ko2iblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ CIBLND, + /* .nf_name */ "cib", + /* .nf_modname */ "kciblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ OPENIBLND, + /* .nf_name */ "openib", + /* .nf_modname */ "kopeniblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ IIBLND, + /* .nf_name */ "iib", + /* .nf_modname */ "kiiblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ VIBLND, + /* 
.nf_name */ "vib", + /* .nf_modname */ "kviblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ RALND, + /* .nf_name */ "ra", + /* .nf_modname */ "kralnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ QSWLND, + /* .nf_name */ "elan", + /* .nf_modname */ "kqswlnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GMLND, + /* .nf_name */ "gm", + /* .nf_modname */ "kgmlnd", + /* .nf_addr2str */ libcfs_hexnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ MXLND, + /* .nf_name */ "mx", + /* .nf_modname */ "kmxlnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ PTLLND, + /* .nf_name */ "ptl", + /* .nf_modname */ "kptllnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GNILND, + /* .nf_name */ "gni", + /* .nf_modname */ "kgnilnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + /* placeholder for net0 alias. It MUST BE THE LAST ENTRY */ + {/* .nf_type */ -1}, +}; + +static const int libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ + +static struct netstrfns * +libcfs_lnd2netstrfns(int lnd) +{ + int i; + + if (lnd >= 0) + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (libcfs_netstrfns[i].nf_type >= 0 && + !strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(int type) +{ + return libcfs_lnd2netstrfns(type) != NULL; +} +EXPORT_SYMBOL(libcfs_isknown_lnd); + +char * +libcfs_lnd2modname(int lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} +EXPORT_SYMBOL(libcfs_lnd2modname); + +char * +libcfs_lnd2str(int lnd) +{ + char *str; + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + if (nf != NULL) + return nf->nf_name; + + str = libcfs_next_nidstring(); + snprintf(str, LNET_NIDSTR_SIZE, "?%d?", lnd); + return str; +} +EXPORT_SYMBOL(libcfs_lnd2str); + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} +EXPORT_SYMBOL(libcfs_str2lnd); + +char * +libcfs_net2str(__u32 net) +{ + int lnd = LNET_NETTYP(net); + int num = LNET_NETNUM(net); + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + char *str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "<%d:%d>", lnd, num); + else if (num == 0) + snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name); + else + snprintf(str, LNET_NIDSTR_SIZE, "%s%d", nf->nf_name, num); + + return str; +} +EXPORT_SYMBOL(libcfs_net2str); + +char * +libcfs_nid2str(lnet_nid_t nid) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + int lnd = LNET_NETTYP(net); + int nnum = LNET_NETNUM(net); + struct netstrfns *nf; + char *str; + int nob; + + if (nid == LNET_NID_ANY) + return ""; + + nf = libcfs_lnd2netstrfns(lnd); + str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "%x@<%d:%d>", addr, lnd, nnum); + else { + nf->nf_addr2str(addr, str); + nob = strlen(str); + if (nnum == 0) + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s", + nf->nf_name); + else + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%d", + nf->nf_name, nnum); + } + + return str; +} +EXPORT_SYMBOL(libcfs_nid2str); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *uninitialized_var(nf); + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +char * +libcfs_id2str(lnet_process_id_t id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 1 if \a src parses to '*' | \ | \ + * \retval 0 otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 1; + } + + LIBCFS_ALLOC(addrrange, sizeof(struct addrrange)); + if (addrrange == NULL) + return 0; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. 
"elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + LIBCFS_ALLOC(nr, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str); + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + LIBCFS_FREE(ar, sizeof(struct addrrange)); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + LIBCFS_FREE(nr, sizeof(struct nidrange)); + } +} +EXPORT_SYMBOL(cfs_free_nidlist); + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} +EXPORT_SYMBOL(cfs_parse_nidlist); + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). 
+ * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cfs_match_nid); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/prng.c b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c new file mode 100644 index 000000000..4147664ff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/prng.c + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. 
+ * algorithm recommended by Marsaglia +*/ + +#include "../../include/linux/libcfs/libcfs.h" + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; + +/** + * cfs_rand - creates new seeds + * + * First it creates new seeds from the previous seeds. Then it generates a + * new pseudo random number for use. + * + * Returns a pseudo-random 32-bit integer + */ +unsigned int cfs_rand(void) +{ + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(cfs_rand); + +/** + * cfs_srand - sets the initial seed + * @seed1 : (seed_x) should have the most entropy in the low bits of the word + * @seed2 : (seed_y) should have the most entropy in the high bits of the word + * + * Replaces the original seeds with new values. Used to generate a new pseudo + * random numbers. + */ +void cfs_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(cfs_srand); + +/** + * cfs_get_random_bytes - generate a bunch of random numbers + * @buf : buffer to fill with random numbers + * @size: size of passed in buffer + * + * Fills a buffer with random bytes + */ +void cfs_get_random_bytes(void *buf, int size) +{ + int *p = buf; + int rem, tmp; + + LASSERT(size >= 0); + + rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size); + if (rem) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, rem); + p = buf + rem; + size -= rem; + } + + while (size >= sizeof(int)) { + get_random_bytes(&tmp, sizeof(tmp)); + *p = cfs_rand() ^ tmp; + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, size); + } +} +EXPORT_SYMBOL(cfs_get_random_bytes); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c new file mode 100644 index 000000000..c86394f7f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c @@ -0,0 +1,1196 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
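cfs_rand() above concatenates two 16-bit multiply-with-carry generators, one per 32-bit seed word: the low 16 bits hold the current value and the high 16 bits the carry. A user-space sketch of the same update rule with the same constants (names are illustrative, not part of the patch; a 32-bit unsigned int is assumed, as in the kernel code):

#include <stdio.h>

#define MWC_CONST_A 18030       /* same pair as RANDOM_CONST_A/B above */
#define MWC_CONST_B 29013

static unsigned int mwc_x = 521288629; /* value in low 16 bits, carry in high 16 */
static unsigned int mwc_y = 362436069;

/* Same step as cfs_rand(): x = a*(x & 0xffff) + (x >> 16), likewise for y */
static unsigned int mwc_rand(void)
{
        mwc_x = MWC_CONST_A * (mwc_x & 65535) + (mwc_x >> 16);
        mwc_y = MWC_CONST_B * (mwc_y & 65535) + (mwc_y >> 16);

        /* pack 16 bits of x into the high half, 16 bits of y into the low half */
        return (mwc_x << 16) + (mwc_y & 65535);
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++)
                printf("%u\n", mwc_rand());
        return 0;
}

As the surrounding code shows, this is a statistical generator only; when stronger randomness is wanted, cfs_get_random_bytes() XORs its output with get_random_bytes().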
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE +#include "tracefile.h" + +#include "../../include/linux/libcfs/libcfs.h" + +/* XXX move things up to the top, comment */ +union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; +static struct tracefiled_ctl trace_tctl; +struct mutex cfs_trace_thread_mutex; +static int thread_running; + +static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd); + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && memory_pressure_get()) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. + */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock) +{ + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
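cfs_tage_alloc() above allocates in two steps, a page and then its small descriptor, and unwinds the first step when the second fails so nothing leaks. A minimal user-space sketch of that allocate-then-unwind pattern (struct tage and PAGE_SZ here are simplified stand-ins, not the kernel types):

#include <stdlib.h>

#define PAGE_SZ 4096

struct tage {
        void *page;     /* the data page */
        size_t used;    /* bytes written so far */
};

/* Allocate page + descriptor; free the page if the descriptor allocation fails. */
static struct tage *tage_alloc(void)
{
        struct tage *t;
        void *page = malloc(PAGE_SZ);

        if (page == NULL)
                return NULL;

        t = malloc(sizeof(*t));
        if (t == NULL) {
                free(page);     /* undo step 1 on step-2 failure */
                return NULL;
        }

        t->page = page;
        t->used = 0;
        return t;
}

static void tage_free(struct tage *t)
{
        free(t->page);
        free(t);
}

int main(void)
{
        struct tage *t = tage_alloc();

        if (t != NULL)
                tage_free(t);
        return 0;
}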
+ */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct cfs_trace_page *tage; + + tage = cfs_tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_CACHE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!memory_pressure_get() || + in_interrupt()) && printk_ratelimit()) + printk(KERN_WARNING + "cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + if (tcd->tcd_cur_pages > 8 && thread_running) { + struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. + */ + wake_up(&tctl->tctl_waitq); + } + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + printk(KERN_WARNING "debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; + } + put_pages_on_tcd_daemon_list(&pc, tcd); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (len > PAGE_CACHE_SIZE) { + pr_err("cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + va_list args; + int rc; + + va_start(args, format); + rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); + va_end(args); + + return rc; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, va_list args, + const char *format2, ...) 
+{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int depth; + int i; + int remain; + int mask = msgdata->msg_mask; + const char *file = kbasename(msgdata->msg_file); + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + depth = __current_nesting_level(); + known_size = strlen(file) + 1 + depth; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /*/ + * '2' used because vsnprintf return real size required for output + * _without_ terminating NULL. + * if needed is to small for this format. + */ + for (i = 0; i < 2; i++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > PAGE_CACHE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_CACHE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf + needed, remain, + format2, ap); + va_end(ap); + } + + if (needed < max_nob) /* well. printing ok.. 
*/ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n", + file, msgdata->msg_line, msgdata->msg_fn); + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + /* indent message according to the nesting level */ + while (depth-- > 0) { + *(debug_buf++) = '.'; + ++ tage->used; + } + + strcpy(debug_buf, file); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strcpy(debug_buf, msgdata->msg_fn); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT (tage->used <= PAGE_CACHE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cfs_time_after(cfs_time_current(), cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + cfs_print_to_console(&header, mask, string_buf, needed, file, + msgdata->msg_fn); + cfs_trace_put_tcd(tcd); + } else { + string_buf = cfs_trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, + CFS_TRACE_CONSOLE_BUFFER_SIZE, + format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, + format2, ap); + va_end(ap); + } + } + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = cfs_trace_get_console_buffer(); + + needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? 
"s" : ""); + + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), + msgdata->msg_file, msgdata->msg_fn); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + spin_lock(&pc->pc_lock); + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + spin_lock(&pc->pc_lock); + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that + * we have a good amount of data at all times for dumping during an LBUG, even + * if we have been steadily writing (and otherwise discarding) pages via the + * debug daemon. 
*/ +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd) +{ + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock(&pc->pc_lock); + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) + continue; + + cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages++; + + if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { + struct cfs_trace_page *victim; + + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); + + __LASSERT_TAGE_INVARIANT(victim); + + list_del(&victim->linkage); + cfs_tage_free(victim); + tcd->tcd_cur_daemon_pages--; + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_on_daemon_list(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) + put_pages_on_tcd_daemon_list(pc, tcd); + } +} + +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + struct page *page; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + int rc; + + DECL_MMSPACE; + + cfs_tracefile_write_lock(); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_err("LustreError: can't open %s for dump: rc %d\n", + filename, rc); + goto out; + } + + spin_lock_init(&pc.pc_lock); + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. 
in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + MMSPACE_OPEN; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = vfs_write(filp, (__force const char __user *)buf, + tage->used, &filp->f_pos); + kunmap(tage->page); + + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + MMSPACE_CLOSE; + rc = vfs_fsync(filp, 1); + if (rc) + pr_err("sync returns %d\n", rc); +close: + filp_close(filp, NULL); +out: + cfs_tracefile_write_unlock(); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob) +{ + int nob; + + if (usr_buffer_nob > knl_buffer_nob) + return -EOVERFLOW; + + if (copy_from_user((void *)knl_buffer, + usr_buffer, usr_buffer_nob)) + return -EFAULT; + + nob = strnlen(knl_buffer, usr_buffer_nob); + while (nob-- >= 0) /* strip trailing whitespace */ + if (!isspace(knl_buffer[nob])) + break; + + if (nob < 0) /* empty string */ + return -EINVAL; + + if (nob == knl_buffer_nob) /* no space to terminate */ + return -EOVERFLOW; + + knl_buffer[nob + 1] = 0; /* terminate */ + return 0; +} +EXPORT_SYMBOL(cfs_trace_copyin_string); + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_allocate_string_buffer(char **str, int nob) +{ + if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */ + return -EINVAL; + + *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO); + if (*str == NULL) + return -ENOMEM; + + return 0; +} + +void cfs_trace_free_string_buffer(char *str, int nob) +{ + kfree(str); +} + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc != 0) + goto out; + + if (str[0] != '/') { + rc = -EINVAL; + goto out; + } + rc = cfs_tracefile_dump_all_pages(str); +out: + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + cfs_tracefile_write_lock(); + + if (strcmp(str, "stop") == 0) { + cfs_tracefile_write_unlock(); + cfs_trace_stop_thread(); + cfs_tracefile_write_lock(); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); + if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size <<= 20; + + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + printk(KERN_INFO + "Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n", + cfs_tracefile, + (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + cfs_tracefile_write_unlock(); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc == 0) + rc = cfs_trace_daemon_command(str); + + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + int pages; + int limit = cfs_trace_max_debug_mb(); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + printk(KERN_WARNING + "Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n", + mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + printk(KERN_WARNING + "Lustre: %d MB is too large for debug buffer size, setting it to %d MB.\n", + mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_CACHE_SHIFT); + + cfs_tracefile_write_lock(); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + cfs_tracefile_write_unlock(); + + return 0; +} + +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) +{ + char str[32]; + int rc; + + rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); + if (rc < 0) + return rc; + + return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); +} + +int 
cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + cfs_tracefile_read_lock(); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + cfs_tracefile_read_unlock(); + + return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct tracefiled_ctl *tctl = arg; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + DECL_MMSPACE; + + /* we're started late enough that we pick up init's fs context */ + /* this is so broken in uml? what on earth is going on? */ + + spin_lock_init(&pc.pc_lock); + complete(&tctl->tctl_start); + + while (1) { + wait_queue_t __wait; + + pc.pc_want_daemon_pages = 0; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + goto end_loop; + + filp = NULL; + cfs_tracefile_read_lock(); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_WARNING "couldn't open %s: %d\n", + cfs_tracefile, rc); + } + } + cfs_tracefile_read_unlock(); + if (filp == NULL) { + put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + goto end_loop; + } + + MMSPACE_OPEN; + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) { + static loff_t f_pos; + + __LASSERT_TAGE_INVARIANT(tage); + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(file_inode(filp))) + f_pos = i_size_read(file_inode(filp)); + + buf = kmap(tage->page); + rc = vfs_write(filp, (__force const char __user *)buf, + tage->used, &f_pos); + kunmap(tage->page); + + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + MMSPACE_CLOSE; + + filp_close(filp, NULL); + put_pages_on_daemon_list(&pc); + if (!list_empty(&pc.pc_pages)) { + int i; + + printk(KERN_ALERT "Lustre: trace pages aren't empty\n"); + pr_err("total cpus(%d): ", + num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + pr_cont("%d(on) ", i); + else + pr_cont("%d(off) ", i); + pr_cont("\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + pr_err("page %d belongs to cpu %d\n", + ++i, tage->cpu); + pr_err("There are %d pages unwritten\n", i); + } + __LASSERT(list_empty(&pc.pc_pages)); +end_loop: + if (atomic_read(&tctl->tctl_shutdown)) { + if (last_loop == 0) { + last_loop = 1; + continue; + } else { + break; + } + } + init_waitqueue_entry(&__wait, current); + add_wait_queue(&tctl->tctl_waitq, &__wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + remove_wait_queue(&tctl->tctl_waitq, &__wait); + } + complete(&tctl->tctl_stop); + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + int rc = 0; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) + goto out; + + init_completion(&tctl->tctl_start); + init_completion(&tctl->tctl_stop); + init_waitqueue_head(&tctl->tctl_waitq); + atomic_set(&tctl->tctl_shutdown, 0); + + if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) { + rc = -ECHILD; + goto out; + } + + wait_for_completion(&tctl->tctl_start); + thread_running = 1; +out: + mutex_unlock(&cfs_trace_thread_mutex); + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct 
tracefiled_ctl *tctl = &trace_tctl; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) { + printk(KERN_INFO + "Lustre: shutting down debug daemon thread...\n"); + atomic_set(&tctl->tctl_shutdown, 1); + wait_for_completion(&tctl->tctl_stop); + thread_running = 0; + } + mutex_unlock(&cfs_trace_thread_mutex); +} + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + int rc; + int factor; + + rc = cfs_tracefile_init_arch(); + if (rc != 0) + return rc; + + cfs_tcd_for_each(tcd, i, j) { + /* tcd_pages_factor is initialized int tracefile_init_arch. */ + factor = tcd->tcd_pages_factor; + INIT_LIST_HEAD(&tcd->tcd_pages); + INIT_LIST_HEAD(&tcd->tcd_stock_pages); + INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_cur_daemon_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + + return 0; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + tcd->tcd_shutting_down = 1; + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, + linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + trace_cleanup_on_all_cpus(); + + cfs_tracefile_fini_arch(); +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_cleanup(); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h new file mode 100644 index 000000000..0601476e1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
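The sizing helpers in tracefile.c above, cfs_trace_set_debug_mb() and cfs_trace_get_debug_mb(), turn a total megabyte budget into per-CPU page quotas: the budget is divided across possible CPUs, converted to pages with mb << (20 - PAGE_CACHE_SHIFT), and then split between buffer types by tcd_pages_factor (a percentage set in tracefile_init_arch(), outside this hunk). A small sketch of that arithmetic, assuming 4 KiB pages and an illustrative 80% factor:

#include <stdio.h>

#define PAGE_SHIFT_ASSUMED 12   /* 4 KiB pages assumed for the example */

/* pages one buffer type gets on one CPU for a given total budget */
static int pages_for(int total_mb, int ncpus, int pages_factor)
{
        int per_cpu_mb = total_mb / ncpus;                       /* mb /= num_possible_cpus() */
        int per_cpu_pages = per_cpu_mb << (20 - PAGE_SHIFT_ASSUMED);

        return per_cpu_pages * pages_factor / 100;               /* tcd_pages_factor percent */
}

int main(void)
{
        /* e.g. a 256 MB budget on 8 CPUs for a type that gets 80% of the share:
         * 256/8 = 32 MB -> 8192 pages -> 6553 pages */
        printf("max pages per CPU: %d\n", pages_for(256, 8, 80));
        return 0;
}

cfs_trace_get_debug_mb() runs the conversion the other way: it sums tcd_max_pages over all per-CPU buffers, shifts back to megabytes, and rounds up by one.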
+ */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include "../../include/linux/libcfs/libcfs.h" + +#include "linux/linux-tracefile.h" + +/* trace file lock routines */ + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +extern void libcfs_run_debug_log_upcall(char *file); + +int cfs_tracefile_init_arch(void); +void cfs_tracefile_fini_arch(void); + +void cfs_tracefile_read_lock(void); +void cfs_tracefile_read_unlock(void); +void cfs_tracefile_write_lock(void); +void cfs_tracefile_write_unlock(void); + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_allocate_string_buffer(char **str, int nob); +void cfs_trace_free_string_buffer(char *str, int nob); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_get_debug_mb(void); + +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; +extern int cfs_trace_max_debug_mb(void); + +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +#ifdef LUSTRE_TRACEFILE_PRIVATE + +/* + * Private declare for tracefile + */ +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) + +#define CFS_TRACEFILE_SIZE (500 << 20) + +/* Size of a buffer for sprinting console messages if we can't get a page + * from system */ +#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ + struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ + unsigned long tcd_cur_daemon_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. 
+ * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +#define TCD_MAX_TYPES 8 +extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; cfs_trace_data[i] != NULL; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; + /* + * spin-lock protecting ->pc_pages. It is taken by smp_call_function() + * call-back functions. XXX nikita: Which is horrible: all processors + * receive NMI at the same time only to be serialized by this + * lock. Probably ->pc_pages should be replaced with an array of + * NR_CPUS elements accessed locklessly. + */ + spinlock_t pc_lock; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ + int pc_want_daemon_pages; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct tracefiled_ctl { + struct completion tctl_start; + struct completion tctl_stop; + wait_queue_head_t tctl_waitq; + pid_t tctl_pid; + atomic_t tctl_shutdown; +}; + +/* + * small data-structure for each page owned by tracefiled. 
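The union cfs_trace_data_union above pads each per-CPU cfs_trace_cpu_data slot out to a cache-line multiple (char __pad[L1_CACHE_ALIGN(...)]) so that counters updated on one CPU never share a cache line with a neighbouring CPU's slot. A user-space sketch of the same padding idiom, with a hard-coded 64-byte line size standing in for L1_CACHE_ALIGN():

#include <stdio.h>

#define CACHELINE 64
#define CACHE_ALIGN(x) (((x) + CACHELINE - 1) & ~(CACHELINE - 1))

struct per_cpu_stats {
        unsigned long hits;
        unsigned long misses;
};

/* Each slot occupies a whole number of cache lines, like cfs_trace_data_union. */
union padded_stats {
        struct per_cpu_stats s;
        char pad[CACHE_ALIGN(sizeof(struct per_cpu_stats))];
};

static union padded_stats stats[8];     /* one slot per CPU, NR_CPUS-style */

int main(void)
{
        printf("raw size %zu, padded size %zu\n",
               sizeof(struct per_cpu_stats), sizeof(union padded_stats));
        stats[0].s.hits++;      /* updates on CPU 0 never touch CPU 1's line */
        return 0;
}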
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +extern void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *m, + unsigned long stack); +extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn); + +extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); +extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); + +/** + * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][] + * are not public libcfs API; they should be defined in + * platform-specific tracefile include files + * (see, for example, linux-tracefile.h). + */ + +extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; +extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void); + +static inline char * +cfs_trace_get_console_buffer(void) +{ + unsigned int i = get_cpu(); + unsigned int j = cfs_trace_buf_idx_get(); + + return cfs_trace_console_buffers[i][j]; +} + +static inline void +cfs_trace_put_console_buffer(char *buffer) +{ + put_cpu(); +} + +static inline struct cfs_trace_cpu_data * +cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void +cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + + put_cpu(); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock); + + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_CACHE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* LUSTRE_TRACEFILE_PRIVATE */ + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c new file mode 100644 index 000000000..48009b775 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c @@ -0,0 +1,479 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
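The __LASSERT() and __LASSERT_TAGE_INVARIANT() macros at the end of tracefile.h above wrap their bodies in do { ... } while (0) so that a multi-statement macro behaves as a single statement, for example inside an unbraced if/else. A minimal sketch of the same idiom with a plain abort-based assertion (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

/*
 * do { ... } while (0) lets a multi-statement macro be used anywhere a
 * single statement (followed by a semicolon) is expected.
 */
#define MY_ASSERT(cond)                                         \
do {                                                            \
        if (!(cond)) {                                          \
                fprintf(stderr, "ASSERTION(%s) failed\n", #cond); \
                abort();                                        \
        }                                                       \
} while (0)

int main(void)
{
        int x = 1;

        if (x)
                MY_ASSERT(x > 0);       /* safe even without braces around the if body */
        else
                printf("never reached\n");

        return 0;
}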
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +#define CFS_WS_NAME_LEN 16 + +typedef struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +} cfs_wi_sched_t; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline void +cfs_wi_sched_lock(cfs_wi_sched_t *sched) +{ + spin_lock(&sched->ws_lock); +} + +static inline void +cfs_wi_sched_unlock(cfs_wi_sched_t *sched) +{ + spin_unlock(&sched->ws_lock); +} + +static inline int +cfs_wi_sched_cansleep(cfs_wi_sched_t *sched) +{ + cfs_wi_sched_lock(sched); + if (sched->ws_stopping) { + cfs_wi_sched_unlock(sched); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + return 0; + } + cfs_wi_sched_unlock(sched); + return 1; +} + + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. 
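cfs_wi_schedule() and the scheduler loop below coordinate two flags per work item: wi_scheduled (queued and waiting) and wi_running (its action is executing right now). A work item scheduled while it is running goes onto the rerun queue and is only moved back to the run queue after its action returns, so the action is never re-entered concurrently. A compact user-space sketch of just that flag logic, with the locking and thread pool left out (the names are simplified stand-ins, not the kernel API):

#include <stdio.h>

struct workitem {
        int scheduled;  /* queued, like wi_scheduled */
        int running;    /* action in progress, like wi_running */
        int on_rerunq;  /* stands in for "linked on ws_rerunq" */
};

/* cfs_wi_schedule() logic: queue once; defer to the rerunq if currently running */
static void wi_schedule(struct workitem *wi)
{
        if (wi->scheduled)
                return;                 /* already queued, nothing to do */
        wi->scheduled = 1;
        if (wi->running)
                wi->on_rerunq = 1;      /* run again after the current action returns */
        else
                printf("queued on runq\n");
}

/* scheduler loop body: run the action, then promote a pending rerun request */
static void wi_run(struct workitem *wi, void (*action)(struct workitem *))
{
        wi->running = 1;
        wi->scheduled = 0;
        action(wi);                     /* may call wi_schedule() on itself */
        wi->running = 0;
        if (wi->on_rerunq) {            /* rescheduled while running: back to runq */
                wi->on_rerunq = 0;
                printf("moved from rerunq to runq\n");
        }
}

static void self_rescheduling_action(struct workitem *wi)
{
        wi_schedule(wi);                /* lands on the rerunq, not the runq */
}

int main(void)
{
        struct workitem wi = { 0, 0, 0 };

        wi_schedule(&wi);                       /* "queued on runq" */
        wi_run(&wi, self_rescheduling_action);  /* "moved from rerunq to runq" */
        return 0;
}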
+ */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + LASSERT(wi->wi_running); + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + cfs_wi_sched_unlock(sched); + + return; +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. + */ + cfs_wi_sched_lock(sched); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + cfs_wi_sched_unlock(sched); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + cfs_wi_sched_unlock(sched); + return; +} +EXPORT_SYMBOL(cfs_wi_schedule); + + +static int +cfs_wi_scheduler (void *arg) +{ + struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg; + + cfs_block_allsigs(); + + /* CPT affinity scheduler? */ + if (sched->ws_cptab != NULL) + cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_lock(sched); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + cfs_workitem_t *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + cfs_workitem_t, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + + cfs_wi_sched_unlock(sched); + nloops++; + + rc = (*wi->wi_action) (wi); + + cfs_wi_sched_lock(sched); + if (rc != 0) /* WI should be dead, even be freed! 
*/ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + cfs_wi_sched_lock(sched); + continue; + } + + cfs_wi_sched_unlock(sched); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + cfs_wi_sched_lock(sched); + } + + cfs_wi_sched_unlock(sched); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + int i; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + i = 2; + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_nthreads > 0) { + CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET, + "waiting for %d threads of WI sched[%s] to terminate\n", + sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + int rc; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + strncpy(sched->ws_name, name, CFS_WS_NAME_LEN); + sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0'; + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + rc = 0; + while (nthrs > 0) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02u", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02u", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, "%s", name); + if (!IS_ERR(task)) { + nthrs--; + continue; + } + rc = PTR_ERR(task); + + CERROR("Failed to create thread for WI scheduler %s: %d\n", + name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + sched->ws_starting--; + + 
spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(cfs_wi_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown(void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/Makefile b/kernel/drivers/staging/lustre/lustre/llite/Makefile new file mode 100644 index 000000000..7d70115d5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_LUSTRE_FS) += lustre.o +obj-$(CONFIG_LUSTRE_LLITE_LLOOP) += llite_lloop.o +lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \ + rw.o namei.o symlink.o llite_mmap.o \ + xattr.o xattr_cache.o remote_perm.o llite_rmtacl.o llite_capa.o \ + rw26.o super25.o statahead.o \ + ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \ + vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o + +lustre-$(CONFIG_PROC_FS) += lproc_llite.o +llite_lloop-y := lloop.o diff --git a/kernel/drivers/staging/lustre/lustre/llite/dcache.c b/kernel/drivers/staging/lustre/lustre/llite/dcache.c new file mode 100644 index 000000000..5af013513 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/dcache.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_dlm.h" + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + return; + + if (lld->lld_it) { + ll_intent_release(lld->lld_it); + OBD_FREE(lld->lld_it, sizeof(*lld->lld_it)); + } + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). */ +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +{ + if (len != name->len) + return 1; + + if (memcmp(str, name->name, len)) + return 1; + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + return 0; + + if (d_lustre_invalid(dentry)) + return 1; + + return 0; +} + +static inline int return_if_equal(struct ldlm_lock *lock, void *data) +{ + if ((lock->l_flags & + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) == + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) + return LDLM_ITER_CONTINUE; + return LDLM_ITER_STOP; +} + +/* find any ldlm lock of the inode in mdc and lov + * return 0 not find + * 1 find one + * < 0 error */ +static int find_cbdata(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lov_stripe_md *lsm; + int rc = 0; + + LASSERT(inode); + rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode), + return_if_equal, NULL); + if (rc != 0) + return rc; + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + return rc; + + rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL); + ccc_inode_lsm_put(inode, lsm); + + return rc; +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(const struct dentry *de) +{ + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", + de, de, de->d_parent, d_inode(de), + d_unhashed(de) ? 
"" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + + /* kernel >= 2.6.38 last refcount is decreased after this function. */ + LASSERT(d_count(de) == 1); + + /* Disable this piece of code temporarily because this is called + * inside dcache_lock so it's not appropriate to do lots of work + * here. ATTENTION: Before this piece of code enabling, LU-2487 must be + * resolved. */ +#if 0 + /* if not ldlm lock for this inode, set i_nlink to 0 so that + * this inode can be recycled later b=20433 */ + if (d_really_is_positive(de) && !find_cbdata(d_inode(de))) + clear_nlink(d_inode(de)); +#endif + + if (d_lustre_invalid((struct dentry *)de)) + return 1; + return 0; +} + +int ll_d_init(struct dentry *de) +{ + LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "ldd on dentry %pd (%p) parent %p inode %p refc %d\n", + de, de, de->d_parent, d_inode(de), + d_count(de)); + + if (de->d_fsdata == NULL) { + struct ll_dentry_data *lld; + + lld = kzalloc(sizeof(*lld), GFP_NOFS); + if (likely(lld)) { + spin_lock(&de->d_lock); + if (likely(de->d_fsdata == NULL)) { + de->d_fsdata = lld; + __d_lustre_invalidate(de); + } else { + OBD_FREE_PTR(lld); + } + spin_unlock(&de->d_lock); + } else { + return -ENOMEM; + } + } + LASSERT(de->d_op == &ll_d_ops); + + return 0; +} + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->d.lustre.it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->d.lustre.it_lock_mode = 0; + if (it->d.lustre.it_remote_lock_mode != 0) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing remote lock with cookie%#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, + it->d.lustre.it_remote_lock_mode); + it->d.lustre.it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->d.lustre.it_data); + + it->d.lustre.it_disposition = 0; + it->d.lustre.it_data = NULL; +} + +void ll_invalidate_aliases(struct inode *inode) +{ + struct dentry *dentry; + struct ll_d_hlist_node *p; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", + inode->i_ino, inode->i_generation, inode); + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_u.d_alias) { + CDEBUG(D_DENTRY, "dentry in drop %pd (%p) parent %p inode %p flags %d\n", + dentry, dentry, dentry->d_parent, + d_inode(dentry), dentry->d_flags); + + d_lustre_invalidate(dentry, 0); + } + ll_unlock_dcache(inode); +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *inode) +{ + int rc = 0; + + if (!request) + return 0; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + return -ENOENT; + + rc = ll_prep_inode(&inode, request, NULL, it); + + return rc; +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode) +{ + LASSERT(it != NULL); + + if 
(it->d.lustre.it_lock_mode && inode != NULL) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { + /* on 2.6 there are situation when several lookups and + * revalidations may be requested during single operation. + * therefore, we don't release intent here -bzzz */ + ll_intent_drop_lock(it); + } +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + + /* + * if open&create is set, talk to MDS to make sure file is created if + * necessary, because we can't do this in ->open() later since that's + * called on an inode. return 0 here to let lookup to handle this. + */ + if ((lookup_flags & (LOOKUP_OPEN | LOOKUP_CREATE)) == + (LOOKUP_OPEN | LOOKUP_CREATE)) + return 0; + + if (lookup_flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) + return 1; + + if (d_need_statahead(dir, dentry) <= 0) + return 1; + + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; + + do_statahead_enter(dir, &dentry, d_inode(dentry) == NULL); + ll_statahead_mark(dir, dentry); + return 1; +} + +/* + * Always trust cached dentries. Update statahead window if necessary. + */ +static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) +{ + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, flags=%u\n", + dentry, flags); + + return ll_revalidate_dentry(dentry, flags); +} + + +static void ll_d_iput(struct dentry *de, struct inode *inode) +{ + LASSERT(inode); + if (!find_cbdata(inode)) + clear_nlink(inode); + iput(inode); +} + +const struct dentry_operations ll_d_ops = { + .d_revalidate = ll_revalidate_nd, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_iput = ll_d_iput, + .d_compare = ll_dcompare, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/dir.c b/kernel/drivers/staging/lustre/lustre/llite/dir.c new file mode 100644 index 000000000..a5bc694dc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/dir.c @@ -0,0 +1,1971 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include /* for wait_on_buffer */ +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" +#include "llite_internal.h" + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Solution to index uniqueness problem is to not cache overflow + * pages. Instead, when page hash collision is detected, all overflow pages + * from emerging chain are immediately requested from the server and placed in + * a special data structure (struct ll_dir_chain). This data structure is used + * by ll_readdir() to process entries from overflow pages. When readdir + * invocation finishes, overflow pages are discarded. If page hash collision + * chain weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next portion of + * entries and again discard the pages. This is not as wasteful as it looks, + * because, given reasonable hash, page hash collisions are extremely rare. + * + * 1. directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. 
+ * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * lmv_adjust_dirpages(). + * + */ + +/* returns the page unlocked, but with a reference */ +static int ll_dir_filler(void *_hash, struct page *page0) +{ + struct inode *inode = page0->mapping->host; + int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH; + struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp; + struct ptlrpc_request *request; + struct mdt_body *body; + struct md_op_data *op_data; + __u64 hash = *((__u64 *)_hash); + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT; + int nrdpgs = 0; /* number of pages read actually */ + int npages; + int i; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n", + inode->i_ino, inode->i_generation, inode, hash); + + LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES); + + page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS); + if (page_pool) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + for (npages = 1; npages < max_pages; npages++) { + page = page_cache_alloc_cold(inode->i_mapping); + if (!page) + break; + page_pool[npages] = page; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + op_data->op_npages = npages; + op_data->op_offset = hash; + rc = md_readpage(exp, op_data, page_pool, &request); + ll_finish_md_op_data(op_data); + if (rc < 0) { + /* page0 is special, which was added into page cache early */ + delete_from_page_cache(page0); + } else if (rc == 0) { + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + /* Checked by mdc_readpage() */ + LASSERT(body != NULL); + + if (body->valid & OBD_MD_FLSIZE) + cl_isize_write(inode, body->size); + + nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1) + >> PAGE_CACHE_SHIFT; + SetPageUptodate(page0); + } + unlock_page(page0); + ptlrpc_req_finished(request); + + CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages); + + ll_pagevec_init(&lru_pvec, 0); + for (i = 1; i < npages; i++) { + unsigned long offset; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= nrdpgs) { + page_cache_release(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) { + unlock_page(page); + if (ll_pagevec_add(&lru_pvec, page) == 0) + ll_pagevec_lru_add_file(&lru_pvec); + } else { + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n", + offset, ret); + } + page_cache_release(page); + } + ll_pagevec_lru_add_file(&lru_pvec); + + if (page_pool != &page0) + OBD_FREE(page_pool, sizeof(struct page *) * max_pages); + return rc; +} + +static void ll_check_page(struct inode *dir, struct page *page) +{ + /* XXX: check page format later */ + SetPageChecked(page); +} + +void ll_release_page(struct page *page, int remove) +{ + 
kunmap(page); + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + page_cache_release(page); +} + +/* + * Find, kmap and return page that contains given hash. + */ +static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash, + __u64 *start, __u64 *end) +{ + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + struct address_space *mapping = dir->i_mapping; + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + spin_lock_irq(&mapping->tree_lock); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !radix_tree_exceptional_entry(page)) { + struct lu_dirpage *dp; + + page_cache_get(page); + spin_unlock_irq(&mapping->tree_lock); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * ll_dir_filler() does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n", + offset, *start, *end, *hash); + if (*hash > *end) { + ll_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * ll_get_dir_page() will issue RPC to fetch + * the page we want. 
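+				 * (LDF_COLLIDE in ldp_flags tells us whether
+				 * entries with this hash spill over onto the
+				 * next page, i.e. whether the overflow chain
+				 * has to be re-read from the server.)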
+ */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + + } else { + spin_unlock_irq(&mapping->tree_lock); + page = NULL; + } + return page; +} + +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain) +{ + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; + struct address_space *mapping = dir->i_mapping; + struct lustre_handle lockh; + struct lu_dirpage *dp; + struct page *page; + ldlm_mode_t mode; + int rc; + __u64 start = 0; + __u64 end = 0; + __u64 lhash = hash; + struct ll_inode_info *lli = ll_i2info(dir); + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + + mode = LCK_PR; + rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED, + ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh); + if (!rc) { + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = mode, + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + struct lookup_intent it = { .it_op = IT_READDIR }; + struct ptlrpc_request *request; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it, + op_data, &lockh, NULL, 0, NULL, 0); + + ll_finish_md_op_data(op_data); + + request = (struct ptlrpc_request *)it.d.lustre.it_data; + if (request) + ptlrpc_req_finished(request); + if (rc < 0) { + CERROR("lock enqueue: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, rc); + return ERR_PTR(rc); + } + + CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n", + dir, dir->i_ino, dir->i_generation); + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, + &it.d.lustre.it_lock_handle, dir, NULL); + } else { + /* for cross-ref object, l_ast_data of the lock may not be set, + * we reset it here */ + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie, + dir, NULL); + } + ldlm_lock_dump_handle(D_OTHER, &lockh); + + mutex_lock(&lli->lli_readdir_mutex); + page = ll_dir_page_locate(dir, &lhash, &start, &end); + if (IS_ERR(page)) { + CERROR("dir page locate: "DFID" at %llu: rc %ld\n", + PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page)); + goto out_unlock; + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. 
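+		 * None of these is implemented here; the cached page found
+		 * above is simply handed to the hash_collision path below.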
+ */ + goto hash_collision; + } + + page = read_cache_page(mapping, hash_x_index(hash, hash64), + ll_dir_filler, &lhash); + if (IS_ERR(page)) { + CERROR("read cache page: "DFID" at %llu: rc %ld\n", + PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); + goto out_unlock; + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("page not updated: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } + if (!PageChecked(page)) + ll_check_page(dir, page); + if (PageError(page)) { + CERROR("page error: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + lhash = hash >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + lhash = hash; + } + if (end == start) { + LASSERT(start == lhash); + CWARN("Page-wide hash collision: %llu\n", end); + if (BITS_PER_LONG == 32 && hash64) + CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n", + le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash); + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } +out_unlock: + mutex_unlock(&lli->lli_readdir_mutex); + ldlm_lock_decref(&lockh, mode); + return page; + +fail: + ll_release_page(page, 1); + page = ERR_PTR(-EIO); + goto out_unlock; +} + +int ll_dir_read(struct inode *inode, struct dir_context *ctx) +{ + struct ll_inode_info *info = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = ctx->pos; + int api32 = ll_need_32bit_api(sbi); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + struct page *page; + struct ll_dir_chain chain; + int done = 0; + int rc = 0; + + ll_dir_chain_init(&chain); + + page = ll_get_dir_page(inode, pos, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (!IS_ERR(page)) { + /* + * If page is empty (end of directory is reached), + * use this value. + */ + __u64 hash = MDS_DIR_END_OFF; + __u64 next; + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + /* + * XXX: implement correct swabbing here. + */ + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) + /* + * Skip until we find target hash + * value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) + /* + * Skip dummy record. + */ + continue; + + if (api32 && hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, api32); + type = ll_dirent_type_get(ent); + ctx->pos = lhash; + /* For 'll_nfs_get_name_filldir()', it will try + * to access the 'ent' through its 'lde_name', + * so the parameter 'name' for 'ctx->actor()' + * must be part of the 'ent'. + */ + done = !dir_emit(ctx, ent->lde_name, + namelen, ino, type); + } + next = le64_to_cpu(dp->ldp_hash_end); + if (!done) { + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(page, 0); + } else if (1 /* chain is exhausted*/) { + /* + * Normal case: continue to the next + * page. 
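+					 * The page just processed is dropped
+					 * from the cache if it hash-collides
+					 * with the next one (LDF_COLLIDE).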
+ */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, pos, + &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + ll_release_page(page, 1); + } + } else { + pos = hash; + ll_release_page(page, 0); + } + } else { + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %lu: rc %d\n", + PFID(&info->lli_fid), (unsigned long)pos, rc); + } + } + + ctx->pos = pos; + ll_dir_chain_fini(&chain); + return rc; +} + +static int ll_readdir(struct file *filp, struct dir_context *ctx) +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + int api32 = ll_need_32bit_api(sbi); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n", + inode->i_ino, inode->i_generation, + inode, (unsigned long)lfd->lfd_pos, i_size_read(inode), api32); + + if (lfd->lfd_pos == MDS_DIR_END_OFF) { + /* + * end-of-file. + */ + rc = 0; + goto out; + } + + ctx->pos = lfd->lfd_pos; + rc = ll_dir_read(inode, ctx); + lfd->lfd_pos = ctx->pos; + if (ctx->pos == MDS_DIR_END_OFF) { + if (api32) + ctx->pos = LL_DIR_END_OFF_32BIT; + else + ctx->pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + ctx->pos >>= 32; + } + filp->f_version = inode->i_version; + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); + + return rc; +} + +static int ll_send_mgc_param(struct obd_export *mgc, char *string) +{ + struct mgs_send_param *msp; + int rc = 0; + + msp = kzalloc(sizeof(*msp), GFP_NOFS); + if (!msp) + return -ENOMEM; + + strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN); + rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, + sizeof(struct mgs_send_param), msp, NULL); + if (rc) + CERROR("Failed to set parameter: %d\n", rc); + OBD_FREE_PTR(msp); + + return rc; +} + +static int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump, + char *filename) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int mode; + int err; + + mode = (0755 & ~current_umask()) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, dir, NULL, filename, + strlen(filename), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) { + err = PTR_ERR(op_data); + goto err_exit; + } + + op_data->op_cli_flags |= CLI_SET_MEA; + err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), 0, &request); + ll_finish_md_op_data(op_data); + if (err) + goto err_exit; +err_exit: + ptlrpc_req_finished(request); + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; + int lum_size; + + if (lump != NULL) { + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. 
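+		 * On a big-endian host lmm_magic will not equal
+		 * cpu_to_le32(LOV_USER_MAGIC_Vx), and the matching
+		 * lustre_swab_lov_user_md_vx() below converts the
+		 * structure in place before it is packed into the RPC.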
+ */ + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + return -EINVAL; + } + } + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC)) + op_data->op_cli_flags |= CLI_SET_MEA; + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, + NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) { + if (rc != -EPERM && rc != -EACCES) + CERROR("mdc_setattr fails: rc = %d\n", rc); + } + + /* In the following we use the fact that LOV_USER_MAGIC_V1 and + LOV_USER_MAGIC_V3 have the same initial fields so we do not + need to make the distinction between the 2 versions */ + if (set_default && mgc->u.cli.cl_mgc_mgsexp) { + char *param = NULL; + char *buf; + + param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS); + if (!param) { + rc = -ENOMEM; + goto end; + } + + buf = param; + /* Get fsname and assume devname to be -MDT0000. */ + ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); + strcat(buf, "-MDT0000.lov"); + buf += strlen(buf); + + /* Set root stripesize */ + sprintf(buf, ".stripesize=%u", + lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + goto end; + + /* Set root stripecount */ + sprintf(buf, ".stripecount=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + goto end; + + /* Set root stripeoffset */ + sprintf(buf, ".stripeoffset=%hd", + lump ? 
le16_to_cpu(lump->lmm_stripe_offset) : + (typeof(lump->lmm_stripe_offset))(-1)); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + +end: + if (param != NULL) + OBD_FREE(param, MGS_PARAM_MAXLEN); + } + return rc; +} + +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int rc, lmmsize; + struct md_op_data *op_data; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, lmmsize, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode %lu/%u: rc %d\n", + inode->i_ino, + inode->i_generation, rc); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + rc = -ENODATA; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + int rc, mdtidx; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_flags |= MF_GET_MDT_IDX; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdtidx = op_data->op_mds; + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + return rc; + } + return mdtidx; +} + +/** + * Generic handler to do any pre-copy work. + * + * It send a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. 
*/ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + rc = PTR_ERR(inode); + goto progress; + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, 1); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + goto progress; + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + return rc; +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + rc = PTR_ERR(inode); + goto progress; + } + + rc = ll_data_version(inode, &data_version, + copy->hc_hai.hai_action == HSMA_ARCHIVE); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + goto progress; + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. 
*/ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + /* we do not notify caller */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = EBUSY; + } + + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + return rc; +} + + +static int copy_and_ioctl(int cmd, struct obd_export *exp, + const void __user *data, size_t size) +{ + void *copy; + int rc; + + copy = kzalloc(size, GFP_NOFS); + if (!copy) + return -ENOMEM; + + if (copy_from_user(copy, data, size)) { + rc = -EFAULT; + goto out; + } + + rc = obd_iocontrol(cmd, exp, size, copy, NULL); +out: + OBD_FREE(copy, size); + + return rc; +} + +static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +{ + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + + switch (cmd) { + case LUSTRE_Q_INVALIDATE: + case LUSTRE_Q_FINVALIDATE: + case Q_QUOTAON: + case Q_QUOTAOFF: + case Q_SETQUOTA: + case Q_SETINFO: + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + break; + case Q_GETQUOTA: + if (((type == USRQUOTA && + !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) || + (type == GRPQUOTA && + !in_egroup_p(make_kgid(&init_user_ns, id)))) && + (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT)) + return -EPERM; + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + return -ENOTTY; + } + + if (valid != QC_GENERAL) { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EOPNOTSUPP; + + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA) + qctl->qc_cmd = Q_GETOQUOTA; + else + return -EINVAL; + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + return rc; + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + if (rc != -EALREADY && cmd == Q_QUOTAON) { + oqctl->qc_cmd = Q_QUOTAOFF; + obd_quotactl(sbi->ll_md_exp, oqctl); + } + OBD_FREE_PTR(oqctl); + return rc; + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if (cmd == Q_GETQUOTA && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + + oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS); + if (!oqctl_tmp) { + rc = -ENOMEM; + goto out; + } + + 
oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE_PTR(oqctl_tmp); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } + + return rc; +} + +static char * +ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp = __getname(); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, PATH_MAX); + if (len == 0) + ret = -ENOENT; + else if (len > PATH_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + __putname(tmp); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) __putname(filename) + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n", + inode->i_ino, inode->i_generation, inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + return ll_iocontrol(inode, file, cmd, arg); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + return put_user(inode->i_generation, (int *)arg); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
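+	 * The cases below are examples of such ioctls and are left inside
+	 * this comment, i.e. unhandled: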
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + return mdtidx; + + if (put_user((int)mdtidx, (int *)arg)) + return -EFAULT; + + return 0; + } + case IOC_MDC_LOOKUP: { + struct ptlrpc_request *request = NULL; + int namelen, len = 0; + char *buf = NULL; + char *filename; + struct md_op_data *op_data; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + return rc; + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + rc = -EINVAL; + goto out_free; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out_free; + } + + op_data->op_valid = OBD_MD_FLID; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + goto out_free; + } + ptlrpc_req_finished(request); +out_free: + obd_ioctl_freedata(buf, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *buf = NULL; + char *filename; + int namelen = 0; + int lumlen = 0; + int len; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + return rc; + + data = (void *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) { + rc = -EINVAL; + goto lmv_out_free; + } + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + rc = -EINVAL; + goto lmv_out_free; + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (lum->lum_magic != LMV_USER_MAGIC || + lumlen != sizeof(*lum)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + rc = -EINVAL; + goto lmv_out_free; + } + + /** + * ll_dir_setdirstripe will be used to set dir stripe + * mdc_create--->mdt_reint_create (with dirstripe) + */ + rc = ll_dir_setdirstripe(inode, lum, filename); +lmv_out_free: + obd_ioctl_freedata(buf, len); + return rc; + + } + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + + int set_default = 0; + + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) + return -EFAULT; + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) + return -EFAULT; + } + + if (is_root_inode(inode)) + set_default = 1; + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1, set_default); + + return rc; + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md *lump = (struct lmv_user_md *)arg; + struct lmv_user_md lum; + struct lmv_user_md *tmp; + int lum_size; + int rc = 0; + int mdtindex; + + if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md))) + return -EFAULT; + + if (lum.lum_magic != LMV_MAGIC_V1) + return -EINVAL; + + lum_size = lmv_user_md_size(1, LMV_MAGIC_V1); + tmp = kzalloc(lum_size, GFP_NOFS); + if (!tmp) { + rc = 
-ENOMEM; + goto free_lmv; + } + + *tmp = lum; + tmp->lum_type = LMV_STRIPE_TYPE; + tmp->lum_stripe_count = 1; + mdtindex = ll_get_mdt_idx(inode); + if (mdtindex < 0) { + rc = -ENOMEM; + goto free_lmv; + } + + tmp->lum_stripe_offset = mdtindex; + tmp->lum_objects[0].lum_mds = mdtindex; + memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode), + sizeof(struct lu_fid)); + if (copy_to_user((void *)arg, tmp, lum_size)) { + rc = -EFAULT; + goto free_lmv; + } +free_lmv: + if (tmp) + OBD_FREE(tmp, lum_size); + return rc; + } + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + return -ENOTSUPP; + + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + namelen = strlen(filename); + if (namelen < 1) { + rc = -EINVAL; + goto out_rmdir; + } + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + return rc; + } + case LL_IOC_LOV_SWAP_LAYOUTS: + return -EPERM; + case LL_IOC_OBD_STATFS: + return ll_obd_statfs(inode, (void *)arg); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_MDC_GETINFO: + case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + goto out_req; + } + + if (rc < 0) { + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO)) { + rc = 0; + goto skip_lmm; + } else + goto out_req; + } + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE) { + lump = (struct lov_user_md *)arg; + } else { + struct lov_user_mds_data *lmdp; + + lmdp = (struct lov_user_mds_data *)arg; + lump = &lmdp->lmd_lmm; + } + if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) { + rc = -EFAULT; + goto out_req; + } + rc = -EOVERFLOW; + } +skip_lmm: + if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { + struct lov_user_mds_data *lmdp; + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mode; + st.st_nlink = body->nlink; + st.st_uid = body->uid; + st.st_gid = body->gid; + st.st_rdev = body->rdev; + st.st_size = body->size; + st.st_blksize = PAGE_CACHE_SIZE; + st.st_blocks = body->blocks; + st.st_atime = body->atime; + st.st_mtime = body->mtime; + st.st_ctime = body->ctime; + st.st_ino = inode->i_ino; + + lmdp = (struct lov_user_mds_data *)arg; + if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) { + rc = -EFAULT; + goto out_req; + } + } + +out_req: + ptlrpc_req_finished(request); + if (filename) + ll_putname(filename); + return rc; + } + case IOC_LOV_GETINFO: { + struct lov_user_mds_data *lumd; + struct lov_stripe_md *lsm; + struct 
lov_user_md *lum; + struct lov_mds_md *lmm; + int lmmsize; + lstat_t st; + + lumd = (struct lov_user_mds_data *)arg; + lum = &lumd->lmd_lmm; + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + OBD_ALLOC_LARGE(lmm, lmmsize); + if (lmm == NULL) + return -ENOMEM; + if (copy_from_user(lmm, lum, lmmsize)) { + rc = -EFAULT; + goto free_lmm; + } + + switch (lmm->lmm_magic) { + case LOV_USER_MAGIC_V1: + if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_USER_MAGIC_V3: + if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + rc = -EINVAL; + goto free_lmm; + } + + rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize); + if (rc < 0) { + rc = -ENOMEM; + goto free_lmm; + } + + /* Perform glimpse_size operation. */ + memset(&st, 0, sizeof(st)); + + rc = ll_glimpse_ioctl(sbi, lsm, &st); + if (rc) + goto free_lsm; + + if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) { + rc = -EFAULT; + goto free_lsm; + } + +free_lsm: + obd_free_memmd(sbi->ll_dt_exp, &lsm); +free_lmm: + OBD_FREE_LARGE(lmm, lmmsize); + return rc; + } + case OBD_IOC_LLOG_CATINFO: { + return -EOPNOTSUPP; + } + case OBD_IOC_QUOTACHECK: { + struct obd_quotactl *oqctl; + int error = 0; + + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + + oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); + if (!oqctl) + return -ENOMEM; + oqctl->qc_type = arg; + rc = obd_quotacheck(sbi->ll_md_exp, oqctl); + if (rc < 0) { + CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc); + error = rc; + } + + rc = obd_quotacheck(sbi->ll_dt_exp, oqctl); + if (rc < 0) + CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc); + + OBD_FREE_PTR(oqctl); + return error ?: rc; + } + case OBD_IOC_POLL_QUOTACHECK: { + struct if_quotacheck *check; + + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + + check = kzalloc(sizeof(*check), GFP_NOFS); + if (!check) + return -ENOMEM; + + rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + goto out_poll; + } + + rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + goto out_poll; + } +out_poll: + OBD_FREE_PTR(check); + return rc; + } + case LL_IOC_QUOTACTL: { + struct if_quotactl *qctl; + + qctl = kzalloc(sizeof(*qctl), GFP_NOFS); + if (!qctl) + return -ENOMEM; + + if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) { + rc = -EFAULT; + goto out_quotactl; + } + + rc = quotactl_ioctl(sbi, qctl); + + if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + +out_quotactl: + OBD_FREE_PTR(qctl); + return rc; + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + return ll_get_obd_name(inode, cmd, 
arg); + case LL_IOC_FLUSHCTX: + return ll_flush_ctx(inode); +#ifdef CONFIG_FS_POSIX_ACL + case LL_IOC_RMTACL: { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + rc = rct_add(&sbi->ll_rct, current_pid(), arg); + if (!rc) + fd->fd_flags |= LL_FILE_RMTACL; + return rc; + } else + return 0; + } +#endif + case LL_IOC_GETOBDCOUNT: { + int count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int *)arg, sizeof(int))) + return -EFAULT; + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count, NULL); + if (rc) { + CERROR("get target count failed: %d\n", rc); + return rc; + } + + if (copy_to_user((int *)arg, &count, sizeof(int))) + return -EFAULT; + + return 0; + } + case LL_IOC_PATH2FID: + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + return -EFAULT; + return 0; + case LL_IOC_GET_CONNECT_FLAGS: { + return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void *)arg); + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct ioc_changelog)); + return rc; + case OBD_IOC_FID2PATH: + return ll_fid2path(inode, (void *)arg); + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + hur = kzalloc(sizeof(*hur), GFP_NOFS); + if (!hur) + return -ENOMEM; + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + return -EFAULT; + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + return -E2BIG; + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + return -E2BIG; + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + return -ENOMEM; + + /* Copy the whole struct */ + if (copy_from_user(hur, (void *)arg, totalsize)) { + OBD_FREE_LARGE(hur, totalsize); + return -EFAULT; + } + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + + OBD_FREE_LARGE(hur, totalsize); + + return rc; + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void *)arg, sizeof(hp))) + return -EFAULT; + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + return rc; + } + case LL_IOC_HSM_CT_START: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct lustre_kernelcomm)); + return rc; + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + copy = kzalloc(sizeof(*copy), GFP_NOFS); + if (!copy) + return -ENOMEM; + if (copy_from_user(copy, (char *)arg, 
sizeof(*copy))) { + OBD_FREE_PTR(copy); + return -EFAULT; + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + return rc; + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + copy = kzalloc(sizeof(*copy), GFP_NOFS); + if (!copy) + return -ENOMEM; + if (copy_from_user(copy, (char *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + return -EFAULT; + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + return rc; + } + default: + return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void *)arg); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + goto out; + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + goto out; + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + goto out; + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + return ll_file_open(inode, file); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + return ll_file_release(inode, file); +} + +const struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = generic_read_dir, + .iterate = ll_readdir, + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/file.c b/kernel/drivers/staging/lustre/lustre/llite/file.c new file mode 100644 index 000000000..4b44c634f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/file.c @@ -0,0 +1,3624 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "../include/lustre_dlm.h" +#include "../include/lustre_lite.h" +#include +#include +#include "llite_internal.h" +#include "../include/lustre/ll_fiemap.h" + +#include "../include/cl_object.h" + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static enum llioc_iter +ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + fd->fd_write_failed = false; + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh) +{ + op_data->op_fid1 = ll_i2info(inode)->lli_fid; + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr_blocks = inode->i_blocks; + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + ll_inode_to_ext_flags(inode->i_flags); + op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch; + if (fh) + op_data->op_handle = *fh; + op_data->op_capa1 = ll_mdscapa_get(inode); + + if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; +} + +/** + * Closes the IO epoch and packs all the attributes into @op_data for + * the CLOSE rpc. + */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET; + + if (!(och->och_flags & FMODE_WRITE)) + goto out; + + if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode)) + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + else + ll_ioepoch_close(inode, op_data, &och, 0); + +out: + ll_pack_inode2opdata(inode, op_data, &och->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); +} + +static int ll_close_inode_openhandle(struct obd_export *md_exp, + struct inode *inode, + struct obd_client_handle *och, + const __u64 *data_version) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct obd_device *obd = class_exp2obd(exp); + int epoch_close = 1; + int rc; + + if (obd == NULL) { + /* + * XXX: in case of LMV, is this correct to access + * ->exp_handle? + */ + CERROR("Invalid MDC connection handle %#llx\n", + ll_i2mdexp(inode)->exp_handle.h_cookie); + rc = 0; + goto out; + } + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + /* XXX We leak openhandle and request here. 
*/ + rc = -ENOMEM; + goto out; + } + + ll_prepare_close(inode, op_data, och); + if (data_version != NULL) { + /* Pass in data_version implies release. */ + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + } + epoch_close = op_data->op_flags & MF_EPOCH_CLOSE; + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc == -EAGAIN) { + /* This close must have the epoch closed. */ + LASSERT(epoch_close); + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + if (rc) { + CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n", + inode->i_ino, rc); + rc = 0; + } + } else if (rc) { + CERROR("inode %lu mdc close failed: rc = %d\n", + inode->i_ino, rc); + } + + /* DATA_MODIFIED flag was successfully sent on close, cancel data + * modification flag. */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + if (rc == 0) { + rc = ll_objects_destroy(req, inode); + if (rc) + CERROR("inode %lu ll_objects destroy: rc = %d\n", + inode->i_ino, rc); + } + if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->valid & OBD_MD_FLRELEASED)) + rc = -EBUSY; + } + + ll_finish_md_op_data(op_data); + +out: + if (exp_connect_som(exp) && !epoch_close && + S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) { + ll_queue_done_writing(inode, LLIF_DONE_WRITING); + } else { + md_clear_open_replay_data(md_exp, och); + /* Free @och if it is not waiting for DONE_WRITING. */ + och->och_fh.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + } + if (req) /* This is close request */ + ptlrpc_req_finished(req); + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so skip + * freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + return 0; + } + + och = *och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + /* There might be a race and this handle may already + be closed. 
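+		   (The handle was detached from the inode under
+		   lli_och_mutex above, and the close RPC below runs
+		   outside that mutex, so a concurrent open can install
+		   a fresh handle for the same mode in the meantime.)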
*/ + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och, NULL); + } + + return rc; +} + +static int ll_md_close(struct obd_export *md_exp, struct inode *inode, + struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + int lockmode; + __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct lustre_handle lockh; + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}}; + int rc = 0; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid); + + if (fd->fd_lease_och != NULL) { + bool lease_broken; + + /* Usually the lease is not released when the + * application crashed, we need to release here. */ + rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); + CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n", + PFID(&lli->lli_fid), rc, lease_broken); + + fd->fd_lease_och = NULL; + } + + if (fd->fd_och != NULL) { + rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL); + fd->fd_och = NULL; + goto out; + } + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + if (!md_lock_match(md_exp, flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, &lockh)) + rc = ll_md_real_close(inode, fd->fd_omode); + +out: + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + ll_capa_close(inode); + + return rc; +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) { + fd->fd_flags &= ~LL_FILE_RMTACL; + rct_del(&sbi->ll_rct, current_pid()); + et_search_free(&sbi->ll_et, current_pid()); + } + } +#endif + + if (!is_root_inode(inode)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); + fd = LUSTRE_FPRIVATE(file); + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the owner pid of statahead. + * Different processes can open the same dir, "ll_opendir_key" means: + * it is me that should stop the statahead thread. 
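+	 * (Only the process that installed lli_opendir_key in
+	 * ll_file_open() tears the statahead thread down; other openers
+	 * of the same directory skip ll_stop_statahead() here.)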
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd && + lli->lli_opendir_pid != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + return 0; + } + + if (!S_ISDIR(inode->i_mode)) { + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + rc = ll_md_close(sbi->ll_md_exp, inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + + return rc; +} + +static int ll_intent_file_open(struct dentry *dentry, void *lmm, + int lmmsize, struct lookup_intent *itp) +{ + struct inode *inode = d_inode(dentry); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct dentry *parent = dentry->d_parent; + const char *name = dentry->d_name.name; + const int len = dentry->d_name.len; + struct md_op_data *op_data; + struct ptlrpc_request *req; + __u32 opc = LUSTRE_OPC_ANY; + int rc; + + /* Usually we come here only for NFSD, and we want open lock. + But we can also get here with pre 2.6.15 patchless kernels, and in + that case that lock is also ok */ + /* We can also get here if there was cached open handle in revalidate_it + * but it disappeared while we were getting from there to ll_file_open. + * But this means this file was closed and immediately opened which + * makes a good candidate for using OPEN lock */ + /* If lmmsize & lmm are not 0, we are just setting stripe info + * parameters. No need for the open lock */ + if (lmm == NULL && lmmsize == 0) { + itp->it_flags |= MDS_OPEN_LOCK; + if (itp->it_flags & FMODE_WRITE) + opc = LUSTRE_OPC_CREATE; + } + + op_data = ll_prep_md_op_data(NULL, d_inode(parent), + inode, name, len, + O_RDWR, opc, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + itp->it_flags |= MDS_OPEN_BY_FID; + rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp, + 0 /*unused */, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with -ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + goto out; + ll_release_openhandle(inode, itp); + goto out; + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) { + rc = -ENOENT; + goto out; + } + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + goto out; + } + + rc = ll_prep_inode(&inode, req, NULL, itp); + if (!rc && itp->d.lustre.it_lock_mode) + ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL); + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + return rc; +} + +/** + * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does + * not believe attributes if a few ioepoch holders exist. Attributes for + * previous ioepoch if new one is opened are also skipped by MDS. 
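+ *
+ * The epoch arrives in the MDS open reply; ll_local_open() below passes
+ * it in as:
+ *
+ *	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ *	ll_ioepoch_open(lli, body->ioepoch);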
+ */ +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch) +{ + if (ioepoch && lli->lli_ioepoch != ioepoch) { + lli->lli_ioepoch = ioepoch; + CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n", + ioepoch, PFID(&lli->lli_fid)); + } +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + och->och_fh = body->handle; + och->och_fid = body->fid1; + och->och_lease_handle.cookie = it->d.lustre.it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(!LUSTRE_FPRIVATE(file)); + + LASSERT(fd != NULL); + + if (och) { + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + return rc; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + ll_ioepoch_open(lli, body->ioepoch); + } + + LUSTRE_FPRIVATE(file) = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + return 0; +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. + */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + int rc = 0, opendir_set = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino, + inode->i_generation, inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + fd = ll_file_data_get(); + if (fd == NULL) { + rc = -ENOMEM; + goto out_openerr; + } + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) { + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL && + lli->lli_opendir_pid == 0) { + lli->lli_opendir_key = fd; + lli->lli_opendir_pid = current_pid(); + opendir_set = 1; + } + spin_unlock(&lli->lli_sa_lock); + } + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = fd; + return 0; + } + + if (!it || !it->d.lustre.it_disposition) { + /* Convert f_flags into access mode. 
We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + let's close it somehow. This will decref request. */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + goto out_openerr; + } + + ll_release_openhandle(inode, it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + goto out_openerr; + } + } else { + LASSERT(*och_usecount == 0); + if (!it->d.lustre.it_disposition) { + /* We cannot just request lock handle now, new ELC code + means that one of other OPEN locks for this file + could be cancelled, and since blocking ast handler + would attempt to grab och_mutex as well, that would + result in a deadlock */ + mutex_unlock(&lli->lli_och_mutex); + it->it_create_mode |= M_CHECK_STALE; + rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it); + it->it_create_mode &= ~M_CHECK_STALE; + if (rc) + goto out_openerr; + + goto restart; + } + *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS); + if (!*och_p) { + rc = -ENOMEM; + goto out_och_free; + } + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) + goto out_och_free; + + LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF)); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + goto out_och_free; + } + mutex_unlock(&lli->lli_och_mutex); + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + goto out_och_free; + + ll_capa_open(inode); + + if (!lli->lli_has_smd && + (cl_is_lov_delay_create(file->f_flags) || + (file->f_mode & FMODE_WRITE) == 0)) { + CDEBUG(D_INODE, "object creation was delayed\n"); + goto out_och_free; + } + cl_lov_delay_create_clear(&file->f_flags); + goto out_och_free; + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof(struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (opendir_set != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); + } + + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + return rc; + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + return 0; +} + +/** + * Acquire a lease and open the file. 
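+ *
+ * Typical use (a sketch based on ll_hsm_release() below): take a write
+ * lease without a struct file, do the work, then give the lease back:
+ *
+ *	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
+ *	if (IS_ERR(och))
+ *		return PTR_ERR(och);
+ *	...
+ *	ll_lease_close(och, inode, &lease_broken);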
+ */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req; + struct lustre_handle old_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + return ERR_PTR(-EINVAL); + + if (file != NULL) { + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + __u64 *och_usecount; + + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + return ERR_PTR(-EPERM); + + /* Get the openhandle of the file */ + rc = -EBUSY; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + mutex_unlock(&lli->lli_och_mutex); + return ERR_PTR(rc); + } + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + if (*och_usecount == 1) { + fd->fd_och = *och_p; + *och_p = NULL; + *och_usecount = 0; + rc = 0; + } + } + mutex_unlock(&lli->lli_och_mutex); + if (rc < 0) /* more than 1 opener */ + return ERR_PTR(rc); + + LASSERT(fd->fd_och != NULL); + old_handle = fd->fd_och->och_fh; + } + + och = kzalloc(sizeof(*och), GFP_NOFS); + if (!och) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out; + } + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_handle = old_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req, + ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + goto out_release_it; + + if (it_disposition(&it, DISP_LOOKUP_NEG)) { + rc = -ENOENT; + goto out_release_it; + } + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + goto out_release_it; + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + ll_och_fill(sbi->ll_md_exp, &it, och); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? 
*/ { + rc = -EOPNOTSUPP; + goto out_close; + } + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (it.d.lustre.it_lock_mode == 0 || + it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode, + it.d.lustre.it_lock_bits); + rc = -EPROTO; + goto out_close; + } + + ll_intent_release(&it); + return och; + +out_close: + rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL); + if (rc2) + CERROR("Close openhandle returned %d\n", rc2); + + /* cancel open lock */ + if (it.d.lustre.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.d.lustre.it_lock_mode); + it.d.lustre.it_lock_mode = 0; + } +out_release_it: + ll_intent_release(&it); +out: + OBD_FREE_PTR(och); + return ERR_PTR(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + ldlm_lock_put(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? %d\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled); + + if (!cancelled) + ldlm_cli_cancel(&och->och_lease_handle, 0); + if (lease_broken != NULL) + *lease_broken = cancelled; + + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och, + NULL); + return rc; +} + +/* Fills the obdo with the attributes for the lsm */ +static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp, + struct obd_capa *capa, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct ptlrpc_request_set *set; + struct obd_info oinfo = { { { 0 } } }; + int rc; + + LASSERT(lsm != NULL); + + oinfo.oi_md = lsm; + oinfo.oi_oa = obdo; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_mode = S_IFREG; + oinfo.oi_oa->o_ioepoch = ioepoch; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | + OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLBLKSZ | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLGROUP | OBD_MD_FLEPOCH | + OBD_MD_FLDATAVERSION; + oinfo.oi_capa = capa; + if (sync) { + oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS; + oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK; + } + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("can't allocate ptlrpc set\n"); + rc = -ENOMEM; + } else { + rc = obd_getattr_async(exp, &oinfo, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + } + if (rc == 0) + oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE | + OBD_MD_FLDATAVERSION); + return rc; +} + +/** + * Performs the getattr on the inode and updates its fields. + * If @sync != 0, perform the getattr under the server-side lock. + */ +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct obd_capa *capa = ll_mdscapa_get(inode); + struct lov_stripe_md *lsm; + int rc; + + lsm = ccc_inode_lsm_get(inode); + rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode), + capa, obdo, ioepoch, sync); + capa_put(capa); + if (rc == 0) { + struct ost_id *oi = lsm ? 
&lsm->lsm_oi : &obdo->o_oi; + + obdo_refresh_inode(inode, obdo, obdo->o_valid); + CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n", + POSTID(oi), i_size_read(inode), + (unsigned long long)inode->i_blocks, + 1UL << inode->i_blkbits); + } + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +int ll_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = ccc_env_thread_attr(env); + struct ost_lvb lvb; + int rc = 0; + + ll_inode_size_lock(inode); + /* merge timestamps the most recently obtained from mds with + timestamps obtained from osts */ + LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime; + + lvb.lvb_size = i_size_read(inode); + lvb.lvb_blocks = inode->i_blocks; + lvb.lvb_mtime = LTIME_S(inode->i_mtime); + lvb.lvb_atime = LTIME_S(inode->i_atime); + lvb.lvb_ctime = LTIME_S(inode->i_ctime); + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc == 0) { + if (lvb.lvb_atime < attr->cat_atime) + lvb.lvb_atime = attr->cat_atime; + if (lvb.lvb_ctime < attr->cat_ctime) + lvb.lvb_ctime = attr->cat_ctime; + if (lvb.lvb_mtime < attr->cat_mtime) + lvb.lvb_mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n", + PFID(&lli->lli_fid), attr->cat_size); + cl_isize_write_nolock(inode, attr->cat_size); + + inode->i_blocks = attr->cat_blocks; + + LTIME_S(inode->i_mtime) = lvb.lvb_mtime; + LTIME_S(inode->i_atime) = lvb.lvb_atime; + LTIME_S(inode->i_ctime) = lvb.lvb_ctime; + } + ll_inode_size_unlock(inode); + + return rc; +} + +int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, + lstat_t *st) +{ + struct obdo obdo = { 0 }; + int rc; + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0); + if (rc == 0) { + st->st_size = obdo.o_size; + st->st_blocks = obdo.o_blocks; + st->st_mtime = obdo.o_mtime; + st->st_atime = obdo.o_atime; + st->st_ctime = obdo.o_ctime; + } + return rc; +} + +static bool file_is_noatime(const struct file *file) +{ + const struct vfsmount *mnt = file->f_path.mnt; + const struct inode *inode = file_inode(file); + + /* Adapted from file_accessed() and touch_atime().*/ + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + return false; +} + +void ll_io_init(struct cl_io *io, const struct file *file, int write) +{ + struct inode *inode = file_inode(file); + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + if (write) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode); + } + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + + io->ci_noatime = file_is_noatime(file); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t 
count) +{ + struct ll_inode_info *lli = ll_i2info(file_inode(file)); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct cl_io *io; + ssize_t result; + +restart: + io = ccc_env_thread_io(env); + ll_io_init(io, file, iot == CIT_WRITE); + + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + int write_mutex_locked = 0; + + cio->cui_fd = LUSTRE_FPRIVATE(file); + vio->cui_io_subtype = args->via_io_subtype; + + switch (vio->cui_io_subtype) { + case IO_NORMAL: + cio->cui_iter = args->u.normal.via_iter; + cio->cui_iocb = args->u.normal.via_iocb; + if ((iot == CIT_WRITE) && + !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + if (mutex_lock_interruptible(&lli-> + lli_write_mutex)) { + result = -ERESTARTSYS; + goto out; + } + write_mutex_locked = 1; + } else if (iot == CIT_READ) { + down_read(&lli->lli_trunc_sem); + } + break; + case IO_SPLICE: + vio->u.splice.cui_pipe = args->u.splice.via_pipe; + vio->u.splice.cui_flags = args->u.splice.via_flags; + break; + default: + CERROR("Unknown IO type - %u\n", vio->cui_io_subtype); + LBUG(); + } + result = cl_io_loop(env, io); + if (write_mutex_locked) + mutex_unlock(&lli->lli_write_mutex); + else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ) + up_read(&lli->lli_trunc_sem); + } else { + /* cl_io_rw_init() handled IO */ + result = io->ci_result; + } + + if (io->ci_nob > 0) { + result = io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; + } + goto out; +out: + cl_io_fini(env, io); + /* If any bit been read/written (result != 0), we just return + * short read/write instead of restart io. */ + if ((result == 0 || result == -ENODATA) && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n", + iot == CIT_READ ? "read" : "write", + file, *ppos, count); + LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob); + goto restart; + } + + if (iot == CIT_READ) { + if (result >= 0) + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result >= 0) { + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + return result; +} + +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iter = to; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + &iocb->ki_pos, iov_iter_count(to)); + cl_env_put(env, &refcheck); + return result; +} + +/* + * Write to a file (through the page cache). 
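+ * This mirrors ll_file_read_iter() above: both are thin wrappers around
+ * ll_file_io_generic(), differing only in CIT_READ vs. CIT_WRITE and
+ * the direction of the iov_iter.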
+ */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + cl_env_put(env, &refcheck); + return result; +} + +/* + * Send file content (through pagecache) somewhere with helper + */ +static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_SPLICE); + args->u.splice.via_pipe = pipe; + args->u.splice.via_flags = flags; + + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + return result; +} + +static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct obd_trans_info oti = { 0 }; + struct obdo *oa = NULL; + int lsm_size; + int rc = 0; + struct lov_stripe_md *lsm = NULL, *lsm2; + + OBDO_ALLOC(oa); + if (oa == NULL) + return -ENOMEM; + + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + rc = -ENOENT; + goto out; + } + + lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) * + (lsm->lsm_stripe_count)); + + OBD_ALLOC_LARGE(lsm2, lsm_size); + if (lsm2 == NULL) { + rc = -ENOMEM; + goto out; + } + + oa->o_oi = *oi; + oa->o_nlink = ost_idx; + oa->o_flags |= OBD_FL_RECREATE_OBJS; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP; + obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + memcpy(lsm2, lsm, lsm_size); + ll_inode_size_lock(inode); + rc = obd_create(NULL, exp, oa, &lsm2, &oti); + ll_inode_size_unlock(inode); + + OBD_FREE_LARGE(lsm2, lsm_size); + goto out; +out: + ccc_inode_lsm_put(inode, lsm); + OBDO_FREE(oa); + return rc; +} + +static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg) +{ + struct ll_recreate_obj ucreat; + struct ost_id oi; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg, + sizeof(ucreat))) + return -EFAULT; + + ostid_set_seq_mdt0(&oi); + ostid_set_id(&oi, ucreat.lrc_id); + return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx); +} + +static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg) +{ + struct lu_fid fid; + struct ost_id oi; + u32 ost_idx; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid))) + return -EFAULT; + + fid_to_ostid(&fid, &oi); + ost_idx = (fid_seq(&fid) >> 16) & 0xffff; + return ll_lov_recreate(inode, &oi, ost_idx); +} + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + int flags, struct lov_user_md *lum, int lum_size) +{ + struct lov_stripe_md *lsm = NULL; + struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags}; + int rc = 0; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) { + ccc_inode_lsm_put(inode, lsm); + CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n", + inode->i_ino); + rc = -EEXIST; + goto out; + } + + 
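+	/*
+	 * No layout exists yet: hand the user-supplied layout to the MDS
+	 * through an open intent -- ll_intent_file_open() treats a
+	 * non-NULL lmm with lmmsize != 0 as "just setting stripe info"
+	 * and skips the open lock; the update is done under
+	 * ll_inode_size_lock().
+	 */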
ll_inode_size_lock(inode); + rc = ll_intent_file_open(dentry, lum, lum_size, &oit); + if (rc) + goto out_unlock; + rc = oit.d.lustre.it_status; + if (rc < 0) + goto out_req_free; + + ll_release_openhandle(inode, &oit); + +out_unlock: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + ccc_inode_lsm_put(inode, lsm); +out: + return rc; +out_req_free: + ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data); + goto out; +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", + filename, rc); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + rc = -ENODATA; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) { + rc = -EPROTO; + goto out; + } + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
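+ *
+ * (The "LOV_MAGIC != cpu_to_le32(LOV_MAGIC)" check below is the usual
+ * idiom for "this host is big-endian": on a little-endian host the
+ * conversion is a no-op and no swabbing is required.)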
+ */ + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + int stripe_count; + + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + stripe_count = 0; + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } + } + +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + unsigned long arg) +{ + int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + return -ENOMEM; + + if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) { + OBD_FREE_LARGE(lump, lum_size); + return -EFAULT; + } + + rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + + OBD_FREE_LARGE(lump, lum_size); + return rc; +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + int lum_size, rc; + int flags = FMODE_WRITE; + + /* first try with v1 which is smaller than v3 */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(lumv1, lumv1p, lum_size)) + return -EFAULT; + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + lum_size = sizeof(struct lov_user_md_v3); + if (copy_from_user(&lumv3, lumv3p, lum_size)) + return -EFAULT; + } + + rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + if (rc == 0) { + struct lov_stripe_md *lsm; + __u32 gen; + + put_user(0, &lumv1p->lmm_stripe_count); + + ll_layout_refresh(inode, &gen); + lsm = ccc_inode_lsm_get(inode); + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), + 0, lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + } + return rc; +} + +static int ll_lov_getstripe(struct inode *inode, unsigned long arg) +{ + struct lov_stripe_md *lsm; + int rc = -ENODATA; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, + lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + int rc; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + return -EINVAL; + } + + if (ll_file_nolock(file)) + return -EOPNOTSUPP; + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + 
fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + return -EINVAL; + } + LASSERT(fd->fd_grouplock.cg_lock == NULL); + spin_unlock(&lli->lli_lock); + + rc = cl_get_grouplock(cl_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + if (rc) + return rc; + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + spin_unlock(&lli->lli_lock); + CERROR("another thread just won the race\n"); + cl_put_grouplock(&grouplock); + return -EINVAL; + } + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + spin_unlock(&lli->lli_lock); + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); + return 0; +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + + spin_lock(&lli->lli_lock); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + spin_unlock(&lli->lli_lock); + CWARN("no group lock held\n"); + return -EINVAL; + } + LASSERT(fd->fd_grouplock.cg_lock != NULL); + + if (fd->fd_grouplock.cg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + return -EINVAL; + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + spin_unlock(&lli->lli_lock); + + cl_put_grouplock(&grouplock); + CDEBUG(D_INFO, "group lock %lu released\n", arg); + return 0; +} + +/** + * Close inode open handle + * + * \param inode [in] inode in question + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct inode *inode, struct lookup_intent *it) +{ + struct obd_client_handle *och; + int rc; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + return 0; + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + return 0; + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + och = kzalloc(sizeof(*och), GFP_NOFS); + if (!och) { + rc = -ENOMEM; + goto out; + } + + ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + return rc; +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. + */ +static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, + size_t num_bytes) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct lov_stripe_md *lsm = NULL; + struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, }; + __u32 vallen = num_bytes; + int rc; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + return -ENOENT; + + /* If the stripe_count > 1 and the application does not understand + * DEVICE_ORDER flag, then it cannot interpret the extents correctly. 
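+	 * (FIEMAP_FLAG_DEVICE_ORDER is a Lustre extension; roughly, it lets
+	 * extents be returned grouped per stripe device rather than in a
+	 * single logical-offset order, which is what makes a multi-stripe
+	 * mapping representable.)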
+ */ + if (lsm->lsm_stripe_count > 1 && + !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + rc = -EOPNOTSUPP; + goto out; + } + + fm_key.oa.o_oi = lsm->lsm_oi; + fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + goto out; + } + + obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid); + /* If filesize is 0, then there would be no objects for mapping */ + if (fm_key.oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + rc = 0; + goto out; + } + + memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap)); + + rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen, + fiemap, lsm); + if (rc) + CERROR("obd_get_info failed: rc = %d\n", rc); + +out: + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + struct getinfo_fid2path *gfout; + u32 pathlen; + size_t outsize; + int rc; + + if (!capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + return -EPERM; + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + return -EFAULT; + + if (pathlen > PATH_MAX) + return -EINVAL; + + outsize = sizeof(*gfout) + pathlen; + + gfout = kzalloc(outsize, GFP_NOFS); + if (!gfout) + return -ENOMEM; + + if (copy_from_user(gfout, arg, sizeof(*gfout))) { + rc = -EFAULT; + goto gf_free; + } + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + goto gf_free; + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + return rc; +} + +static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg) +{ + struct ll_user_fiemap *fiemap_s; + size_t num_bytes, ret_bytes; + unsigned int extent_count; + int rc = 0; + + /* Get the extent count so we can calculate the size of + * required fiemap buffer */ + if (get_user(extent_count, + &((struct ll_user_fiemap __user *)arg)->fm_extent_count)) + return -EFAULT; + + if (extent_count >= + (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent)) + return -EINVAL; + num_bytes = sizeof(*fiemap_s) + (extent_count * + sizeof(struct ll_fiemap_extent)); + + OBD_ALLOC_LARGE(fiemap_s, num_bytes); + if (fiemap_s == NULL) + return -ENOMEM; + + /* get the fiemap value */ + if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg, + sizeof(*fiemap_s))) { + rc = -EFAULT; + goto error; + } + + /* If fm_extent_count is non-zero, read the first extent since + * it is used to calculate end_offset and device from previous + * fiemap call. */ + if (extent_count) { + if (copy_from_user(&fiemap_s->fm_extents[0], + (char __user *)arg + sizeof(*fiemap_s), + sizeof(struct ll_fiemap_extent))) { + rc = -EFAULT; + goto error; + } + } + + rc = ll_do_fiemap(inode, fiemap_s, num_bytes); + if (rc) + goto error; + + ret_bytes = sizeof(struct ll_user_fiemap); + + if (extent_count != 0) + ret_bytes += (fiemap_s->fm_mapped_extents * + sizeof(struct ll_fiemap_extent)); + + if (copy_to_user((void *)arg, fiemap_s, ret_bytes)) + rc = -EFAULT; + +error: + OBD_FREE_LARGE(fiemap_s, num_bytes); + return rc; +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param extent_lock Take extent lock. 
Not needed if a process is already + * holding the OST object group locks. + */ +int ll_data_version(struct inode *inode, __u64 *data_version, + int extent_lock) +{ + struct lov_stripe_md *lsm = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obdo *obdo = NULL; + int rc; + + /* If no stripe, we consider version is 0. */ + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + *data_version = 0; + CDEBUG(D_INODE, "No object for inode\n"); + rc = 0; + goto out; + } + + obdo = kzalloc(sizeof(*obdo), GFP_NOFS); + if (!obdo) { + rc = -ENOMEM; + goto out; + } + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock); + if (rc == 0) { + if (!(obdo->o_valid & OBD_MD_FLDATAVERSION)) + rc = -EOPNOTSUPP; + else + *data_version = obdo->o_data_version; + } + + OBD_FREE_PTR(obdo); +out: + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + int rc; + + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid)); + + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + goto out; + } + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, 1); + if (rc != 0) + goto out; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + ll_merge_lvb(env, inode); + cl_env_nested_put(&nest, env); + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och, + &data_version); + och = NULL; + + +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + struct iattr ia1, ia2; + __u64 dv1, dv2; + struct inode *inode1, *inode2; + bool check_dv1, check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + llss = kzalloc(sizeof(*llss), GFP_NOFS); + if (!llss) + return -ENOMEM; + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + if (!S_ISREG(llss->inode2->i_mode)) { + rc = -EINVAL; + goto free; + } + + if (inode_permission(llss->inode1, MAY_WRITE) || + inode_permission(llss->inode2, MAY_WRITE)) { + rc = -EPERM; + goto free; + } + + if (llss->inode2->i_sb != llss->inode1->i_sb) { + rc = -EXDEV; + goto free; + } + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! 
*/ { + rc = 0; + goto free; + } + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + goto free; + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + goto free; + } + } + + /* to be able to restore mtime and atime after swap + * we need to first save them */ + if (lsl->sl_flags & + (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_mtime = llss->inode1->i_mtime; + llss->ia1.ia_atime = llss->inode1->i_atime; + llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME; + llss->ia2.ia_mtime = llss->inode2->i_mtime; + llss->ia2.ia_atime = llss->inode2->i_atime; + llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME; + } + + /* ultimate check, before swapping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + goto putgl; + if (dv != llss->dv1) { + rc = -EAGAIN; + goto putgl; + } + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + goto putgl; + if (dv != llss->dv2) { + rc = -EAGAIN; + goto putgl; + } + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto free; + } + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + + /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */ + if (rc != 0) + goto free; + + /* clear useless flags */ + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) { + llss->ia1.ia_valid &= ~ATTR_MTIME; + llss->ia2.ia_valid &= ~ATTR_MTIME; + } + + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_valid &= ~ATTR_ATIME; + llss->ia2.ia_valid &= ~ATTR_ATIME; + } + + /* update time if requested */ + rc = 0; + if (llss->ia2.ia_valid != 0) { + mutex_lock(&llss->inode1->i_mutex); + rc = ll_setattr(file1->f_path.dentry, &llss->ia2); + mutex_unlock(&llss->inode1->i_mutex); + } + + if (llss->ia1.ia_valid != 0) { + int rc1; + + mutex_lock(&llss->inode2->i_mutex); + rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1); + mutex_unlock(&llss->inode2->i_mutex); + if (rc == 0) + rc = rc1; + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + return rc; +} + +static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct md_op_data *op_data; + int rc; + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + + ll_finish_md_op_data(op_data); + + return rc; +} + +static int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* set HSM flags */ + hss = kzalloc(sizeof(*hss), GFP_NOFS); + if (!hss) { + rc = -ENOMEM; + goto out; + } + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + goto out; + + attr = kzalloc(sizeof(*attr), GFP_NOFS); + if (!attr) { + rc = -ENOMEM; + goto out; + } + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + attr->ia_mode |= S_IFREG; + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + mutex_lock(&inode->i_mutex); + + rc = ll_setattr_raw(file->f_path.dentry, attr, true); + if (rc == -ENODATA) + rc = 0; + + mutex_unlock(&inode->i_mutex); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + return rc; +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int flags, rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino, + inode->i_generation, inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. 
+ */ + if (get_user(flags, (int *) arg)) + return -EFAULT; + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on non-O_DIRECT file\n", + current->comm); + return -EINVAL; + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + return 0; + case LL_IOC_LOV_SETSTRIPE: + return ll_lov_setstripe(inode, file, arg); + case LL_IOC_LOV_SETEA: + return ll_lov_setea(inode, file, arg); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char *)arg, + sizeof(struct lustre_swap_layouts))) + return -EFAULT; + + if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */ + return -EPERM; + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + return -EBADF; + + rc = -EPERM; + if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */ + rc = ll_swap_layouts(file, file2, &lsl); + fput(file2); + return rc; + } + case LL_IOC_LOV_GETSTRIPE: + return ll_lov_getstripe(inode, arg); + case LL_IOC_RECREATE_OBJ: + return ll_lov_recreate_obj(inode, arg); + case LL_IOC_RECREATE_FID: + return ll_lov_recreate_fid(inode, arg); + case FSFILT_IOC_FIEMAP: + return ll_ioctl_fiemap(inode, arg); + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + return ll_iocontrol(inode, file, cmd, arg); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + return put_user(inode->i_generation, (int *)arg); + case LL_IOC_GROUP_LOCK: + return ll_get_grouplock(inode, file, arg); + case LL_IOC_GROUP_UNLOCK: + return ll_put_grouplock(inode, file, arg); + case IOC_OBD_STATFS: + return ll_obd_statfs(inode, (void *)arg); + + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_FLUSHCTX: + return ll_flush_ctx(inode); + case LL_IOC_PATH2FID: { + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + return -EFAULT; + + return 0; + } + case OBD_IOC_FID2PATH: + return ll_fid2path(inode, (void *)arg); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char *)arg, sizeof(idv))) + return -EFAULT; + + rc = ll_data_version(inode, &idv.idv_version, + !(idv.idv_flags & LL_DV_NOFLUSH)); + + if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv))) + return -EFAULT; + + return rc; + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + return mdtidx; + + if (put_user((int)mdtidx, (int *)arg)) + return -EFAULT; + + return 0; + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + return ll_get_obd_name(inode, cmd, arg); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + hus = kzalloc(sizeof(*hus), GFP_NOFS); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + return PTR_ERR(op_data); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + return rc; + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + hss = kzalloc(sizeof(*hss), GFP_NOFS); + if (!hss) + return -ENOMEM; + + if (copy_from_user(hss, (char *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + return -EFAULT; + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + return rc; + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + hca = kzalloc(sizeof(*hca), GFP_NOFS); + if (!hca) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + return PTR_ERR(op_data); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + return rc; + } + case LL_IOC_SET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + bool lease_broken; + fmode_t mode = 0; + + switch (arg) { + case F_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + mode = FMODE_WRITE; + break; + case F_RDLCK: + if (!(file->f_mode & FMODE_READ)) + return -EPERM; + mode = FMODE_READ; + break; + case F_UNLCK: + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + mode = och->och_flags & + (FMODE_READ|FMODE_WRITE); + rc = ll_lease_close(och, inode, &lease_broken); + if (rc == 0 && lease_broken) + mode = 0; + } else { + rc = -ENOLCK; + } + + /* return the type of lease or error */ + return rc < 0 ? 
rc : (int)mode; + default: + return -EINVAL; + } + + CDEBUG(D_INODE, "Set lease with mode %d\n", mode); + + /* apply for lease */ + och = ll_lease_open(inode, file, mode, 0); + if (IS_ERR(och)) + return PTR_ERR(och); + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + return rc; + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + rc = och->och_flags & + (FMODE_READ | FMODE_WRITE); + unlock_res_and_lock(lock); + ldlm_lock_put(lock); + } + } + mutex_unlock(&lli->lli_och_mutex); + return rc; + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + hui = kzalloc(sizeof(*hui), GFP_NOFS); + if (!hui) + return -ENOMEM; + + if (copy_from_user(hui, (void *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + return -EFAULT; + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + return rc; + } + default: { + int err; + + if (LLIOC_STOP == + ll_iocontrol_call(inode, file, cmd, arg, &err)) + return err; + + return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void *)arg); + } + } +} + + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval, eof = 0; + + retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : + (origin == SEEK_CUR) ? file->f_pos : 0); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n", + inode->i_ino, inode->i_generation, inode, retval, retval, + origin); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); + + if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { + retval = ll_glimpse_size(inode); + if (retval != 0) + return retval; + eof = i_size_read(inode); + } + + retval = generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + return retval; +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. 
+ */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct obd_capa *capa = NULL; + struct cl_fsync_io *fio; + int result; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + return -EINVAL; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_capa = capa; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + capa_put(capa); + + return result; +} + +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + struct obd_capa *oc; + int rc, err; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + oc = ll_mdscapa_get(inode); + err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc, + &req); + capa_put(oc); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + + mutex_unlock(&inode->i_mutex); + return rc; +} + +static int +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_FLOCK, + .ei_cb_cp = ldlm_flock_completion_ast, + .ei_cbdata = file_lock, + }; + struct md_op_data *op_data; + struct lustre_handle lockh = {0}; + ldlm_policy_data_t flock = {{0}}; + __u64 flags = 0; + int rc; + int rc2 = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n", + inode->i_ino, file_lock); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + + if (file_lock->fl_flags & FL_FLOCK) + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + else if (!(file_lock->fl_flags & FL_POSIX)) + return -EINVAL; + + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.pid = file_lock->fl_pid; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + + /* Somewhat ugly workaround for svc lockd. 
+ * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; + + switch (file_lock->fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. */ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", + file_lock->fl_type); + return -ENOTSUPP; + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. */ + file_lock->fl_type = einfo.ei_mode; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + return -EINVAL; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n", + inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + return rc; +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + return -ENOSYS; +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode) +{ + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ? 
+ (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + + if (!inode) + return 0; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & (1 << i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + return *bits == 0; +} + +ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + ldlm_mode_t mode) +{ + ldlm_policy_data_t policy = { .l_inodebits = {bits} }; + struct lu_fid *fid; + ldlm_mode_t rc; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, mode, lockh); + + return rc; +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to validate size. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *req = NULL; + struct obd_export *exp; + int rc = 0; + + LASSERT(inode != NULL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n", + inode->i_ino, inode->i_generation, inode, dentry); + + exp = ll_i2mdexp(inode); + + /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. + * But under CMD case, it caused some lock issues, should be fixed + * with new CMD ibits lock. See bug 12718 */ + if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { + struct lookup_intent oit = { .it_op = IT_GETATTR }; + struct md_op_data *op_data; + + if (ibits == MDS_INODELOCK_LOOKUP) + oit.it_op = IT_LOOKUP; + + /* Call getattr by fid, so do not provide name at all. */ + op_data = ll_prep_md_op_data(NULL, inode, + inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + oit.it_create_mode |= M_CHECK_STALE; + rc = md_intent_lock(exp, op_data, NULL, 0, + /* we are not interested in name + based lookup */ + &oit, 0, &req, + ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + oit.it_create_mode &= ~M_CHECK_STALE; + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + goto out; + } + + rc = ll_revalidate_it_finish(req, &oit, inode); + if (rc != 0) { + ll_intent_release(&oit); + goto out; + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + do_lookup() -> ll_revalidate_it(). We cannot use d_drop + here to preserve get_cwd functionality on 2.6. 
+ Bug 10503 */ + if (!d_inode(dentry)->i_nlink) + d_lustre_invalidate(dentry, 0); + + ll_lookup_finish_locks(&oit, inode); + } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry)); + u64 valid = OBD_MD_FLGETATTR; + struct md_op_data *op_data; + int ealen = 0; + + if (S_ISREG(inode->i_mode)) { + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, ealen, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = valid; + /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one + * capa for this inode. Because we only keep capas of dirs + * fresh. */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + rc = ll_inode_revalidate_fini(inode, rc); + return rc; + } + + rc = ll_prep_inode(&inode, req, NULL, NULL); + } +out: + ptlrpc_req_finished(req); + return rc; +} + +static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = d_inode(dentry); + int rc; + + rc = __ll_inode_revalidate(dentry, ibits); + if (rc != 0) + return rc; + + /* if object isn't regular file, don't validate size */ + if (!S_ISREG(inode->i_mode)) { + LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime; + } else { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING)) + rc = ll_glimpse_size(inode); + } + return rc; +} + +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + struct inode *inode = d_inode(de); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int res = 0; + + res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP); + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + if (res) + return res; + + stat->dev = inode->i_sb->s_dev; + if (ll_need_32bit_api(sbi)) + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + else + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = 1 << inode->i_blkbits; + + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + + return 0; +} + +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int rc; + size_t num_bytes; + struct ll_user_fiemap *fiemap; + unsigned int extent_count = fieinfo->fi_extents_max; + + num_bytes = sizeof(*fiemap) + (extent_count * + sizeof(struct ll_fiemap_extent)); + OBD_ALLOC_LARGE(fiemap, num_bytes); + + if (fiemap == NULL) + return -ENOMEM; + + fiemap->fm_flags = fieinfo->fi_flags; + fiemap->fm_extent_count = fieinfo->fi_extents_max; + fiemap->fm_start = start; + fiemap->fm_length = len; + if (extent_count > 0) + memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start, + 
sizeof(struct ll_fiemap_extent)); + + rc = ll_do_fiemap(inode, fiemap, num_bytes); + + fieinfo->fi_flags = fiemap->fm_flags; + fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; + if (extent_count > 0) + memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0], + fiemap->fm_mapped_extents * + sizeof(struct ll_fiemap_extent)); + + OBD_FREE_LARGE(fiemap, num_bytes); + return rc; +} + +struct posix_acl *ll_get_acl(struct inode *inode, int type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + + spin_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + return acl; +} + + +int ll_inode_permission(struct inode *inode, int mask) +{ + int rc = 0; + +#ifdef MAY_NOT_BLOCK + if (mask & MAY_NOT_BLOCK) + return -ECHILD; +#endif + + /* as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. */ + + if (is_root_inode(inode)) { + rc = __ll_inode_revalidate(inode->i_sb->s_root, + MDS_INODELOCK_LOOKUP); + if (rc) + return rc; + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n", + inode->i_ino, inode->i_generation, inode, inode->i_mode, mask); + + if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT) + return lustre_check_remote_perm(inode, mask); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); + rc = generic_permission(inode, mask); + + return rc; +} + +/* -o localflock - only provides locally consistent flock locks */ +struct file_operations ll_file_operations = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush +}; + +struct file_operations ll_file_operations_flock = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +struct file_operations ll_file_operations_noflock = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock +}; + +struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .fiemap = ll_fiemap, + .get_acl = ll_get_acl, +}; + +/* dynamic ioctl number support routines */ +static struct llioc_ctl_data { + struct rw_semaphore ioc_sem; + struct list_head ioc_head; +} llioc = { + __RWSEM_INITIALIZER(llioc.ioc_sem), + LIST_HEAD_INIT(llioc.ioc_head) +}; + + +struct llioc_data { + struct list_head iocd_list; + unsigned int iocd_size; + llioc_callback_t iocd_cb; + unsigned int iocd_count; + unsigned int iocd_cmd[0]; +}; + +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned 
int *cmd) +{ + unsigned int size; + struct llioc_data *in_data = NULL; + + if (cb == NULL || cmd == NULL || + count > LLIOC_MAX_CMD || count < 0) + return NULL; + + size = sizeof(*in_data) + count * sizeof(unsigned int); + in_data = kzalloc(size, GFP_NOFS); + if (!in_data) + return NULL; + + memset(in_data, 0, sizeof(*in_data)); + in_data->iocd_size = size; + in_data->iocd_cb = cb; + in_data->iocd_count = count; + memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count); + + down_write(&llioc.ioc_sem); + list_add_tail(&in_data->iocd_list, &llioc.ioc_head); + up_write(&llioc.ioc_sem); + + return in_data; +} +EXPORT_SYMBOL(ll_iocontrol_register); + +void ll_iocontrol_unregister(void *magic) +{ + struct llioc_data *tmp; + + if (magic == NULL) + return; + + down_write(&llioc.ioc_sem); + list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) { + if (tmp == magic) { + unsigned int size = tmp->iocd_size; + + list_del(&tmp->iocd_list); + up_write(&llioc.ioc_sem); + + OBD_FREE(tmp, size); + return; + } + } + up_write(&llioc.ioc_sem); + + CWARN("didn't find iocontrol register block with magic: %p\n", magic); +} +EXPORT_SYMBOL(ll_iocontrol_unregister); + +static enum llioc_iter +ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp) +{ + enum llioc_iter ret = LLIOC_CONT; + struct llioc_data *data; + int rc = -EINVAL, i; + + down_read(&llioc.ioc_sem); + list_for_each_entry(data, &llioc.ioc_head, iocd_list) { + for (i = 0; i < data->iocd_count; i++) { + if (cmd != data->iocd_cmd[i]) + continue; + + ret = data->iocd_cb(inode, file, cmd, arg, data, &rc); + break; + } + + if (ret == LLIOC_STOP) + break; + } + up_read(&llioc.ioc_sem); + + if (rcp) + *rcp = rc; + return ret; +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_env_nest nest; + struct lu_env *env; + int result; + + if (lli->lli_clob == NULL) + return 0; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + result = cl_conf_set(env, lli->lli_clob, conf); + cl_env_nested_put(&nest, env); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + if (result == 0) { + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout should happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + } + } + return result; +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_capa *oc; + struct ptlrpc_request *req; + struct mdt_body *body; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY), + lock->l_lvb_data, lock->l_lvb_len); + + if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY)) + return 0; + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. 
Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + oc = ll_mdscapa_get(inode); + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); + capa_put(oc); + if (rc < 0) + return rc; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lmmsize = body->eadatasize; + if (lmmsize == 0) /* empty layout */ { + rc = 0; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) { + rc = -EFAULT; + goto out; + } + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) { + rc = -ENOMEM; + goto out; + } + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + unlock_res_and_lock(lock); + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, + struct inode *inode, __u32 *gen, bool reconf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct lustre_md md = { NULL }; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n", + inode, PFID(&lli->lli_fid), reconf); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY); + unlock_res_and_lock(lock); + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready || !reconf) { + rc = -ENODATA; + if (lvb_ready) { + /* layout_gen must be valid if layout lock is not + * cancelled and stripe has already set */ + *gen = ll_layout_version_get(lli); + rc = 0; + } + goto out; + } + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + goto out; + + /* for layout lock, lmm is returned in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. See the description in ldlm_lock_decref_internal() + * for the condition to free lvb_data of layout lock */ + if (lock->l_lvb_data != NULL) { + rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm, + lock->l_lvb_data, lock->l_lvb_len); + if (rc >= 0) { + *gen = LL_LAYOUT_GEN_EMPTY; + if (md.lsm != NULL) + *gen = md.lsm->lsm_layout_gen; + rc = 0; + } else { + CERROR("%s: file "DFID" unpackmd error: %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), rc); + } + } + if (rc < 0) + goto out; + + /* set layout to file. 
Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + rc = ll_layout_conf(inode, &conf); + + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + inode, PFID(&lli->lli_fid)); + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n", + PFID(&lli->lli_fid), rc); + } + return rc; +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct lustre_handle lockh; + ldlm_mode_t mode; + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = LCK_CR, + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + int rc; + + *gen = ll_layout_version_get(lli); + if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE) + return 0; + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + LASSERT(S_ISREG(inode->i_mode)); + + /* take layout lock mutex to enqueue layout lock exclusively. */ + mutex_lock(&lli->lli_layout_mutex); + +again: + /* mostly layout lock is caching on the local side, so try to match + * it before grabbing layout lock mutex. 
*/ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, + LCK_CR | LCK_CW | LCK_PR | LCK_PW); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + + mutex_unlock(&lli->lli_layout_mutex); + return rc; + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_layout_mutex); + return PTR_ERR(op_data); + } + + /* have to enqueue one */ + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + lockh.cookie = 0ULL; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid)); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh, + NULL, 0, NULL, 0); + if (it.d.lustre.it_data != NULL) + ptlrpc_req_finished(it.d.lustre.it_data); + it.d.lustre.it_data = NULL; + + ll_finish_md_op_data(op_data); + + mode = it.d.lustre.it_lock_mode; + it.d.lustre.it_lock_mode = 0; + ll_intent_drop_lock(&it); + + if (rc == 0) { + /* set lock data in case this is a new lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + } + mutex_unlock(&lli->lli_layout_mutex); + + return rc; +} + +/** + * This function send a restore request to the MDT + */ +int ll_layout_restore(struct inode *inode) +{ + struct hsm_user_request *hur; + int len, rc; + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + hur = kzalloc(len, GFP_NOFS); + if (!hur) + return -ENOMEM; + + hur->hur_request.hr_action = HUA_RESTORE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.length = -1; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + OBD_FREE(hur, len); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c b/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c new file mode 100644 index 000000000..aec9a4412 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c @@ -0,0 +1,654 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_capa.c + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +/* for obd_capa.c_list, client capa might stay in three places: + * 1. ll_capa_list. + * 2. ll_idle_capas. + * 3. stand alone: just allocated. + */ + +/* capas for oss writeback and those failed to renew */ +static LIST_HEAD(ll_idle_capas); +static struct ptlrpc_thread ll_capa_thread; +static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT]; + +/* llite capa renewal timer */ +struct timer_list ll_capa_timer; +/* for debug: indicate whether capa on llite is enabled or not */ +static atomic_t ll_capa_debug = ATOMIC_INIT(0); +static unsigned long long ll_capa_renewed; +static unsigned long long ll_capa_renewal_noent; +static unsigned long long ll_capa_renewal_failed; +static unsigned long long ll_capa_renewal_retries; + +static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa); + +static inline void update_capa_timer(struct obd_capa *ocapa, unsigned long expiry) +{ + if (time_before(expiry, ll_capa_timer.expires) || + !timer_pending(&ll_capa_timer)) { + mod_timer(&ll_capa_timer, expiry); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "ll_capa_timer update: %lu/%lu by", expiry, jiffies); + } +} + +static inline unsigned long capa_renewal_time(struct obd_capa *ocapa) +{ + return cfs_time_sub(ocapa->c_expiry, + cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2); +} + +static inline int capa_is_to_expire(struct obd_capa *ocapa) +{ + return time_before_eq(capa_renewal_time(ocapa), cfs_time_current()); +} + +static inline int have_expired_capa(void) +{ + struct obd_capa *ocapa = NULL; + int expired = 0; + + /* if ll_capa_list has client capa to expire or ll_idle_capas has + * expired capa, return 1. 
+ */ + spin_lock(&capa_lock); + if (!list_empty(ll_capa_list)) { + ocapa = list_entry(ll_capa_list->next, struct obd_capa, + c_list); + expired = capa_is_to_expire(ocapa); + if (!expired) + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } else if (!list_empty(&ll_idle_capas)) { + ocapa = list_entry(ll_idle_capas.next, struct obd_capa, + c_list); + expired = capa_is_expired(ocapa); + if (!expired) + update_capa_timer(ocapa, ocapa->c_expiry); + } + spin_unlock(&capa_lock); + + if (expired) + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired"); + return expired; +} + +static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head) +{ + struct obd_capa *tmp; + struct list_head *before = NULL; + + /* TODO: client capa is sorted by expiry, this could be optimized */ + list_for_each_entry_reverse(tmp, head, c_list) { + if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) { + before = &tmp->c_list; + break; + } + } + + LASSERT(&ocapa->c_list != before); + list_add(&ocapa->c_list, before ?: head); +} + +static inline int obd_capa_open_count(struct obd_capa *oc) +{ + struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode); + return atomic_read(&lli->lli_open_count); +} + +static void ll_delete_capa(struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode); + + if (capa_for_mds(&ocapa->c_capa)) { + LASSERT(lli->lli_mds_capa == ocapa); + lli->lli_mds_capa = NULL; + } else if (capa_for_oss(&ocapa->c_capa)) { + list_del_init(&ocapa->u.cli.lli_list); + } + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client"); + list_del_init(&ocapa->c_list); + capa_count[CAPA_SITE_CLIENT]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +/* three places where client capa is deleted: + * 1. capa_thread_main(), main place to delete expired capa. + * 2. ll_clear_inode_capas() in ll_clear_inode(). + * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_setattr_ost(). + */ +static int capa_thread_main(void *unused) +{ + struct obd_capa *ocapa, *tmp, *next; + struct inode *inode = NULL; + struct l_wait_info lwi = { 0 }; + int rc; + + thread_set_flags(&ll_capa_thread, SVC_RUNNING); + wake_up(&ll_capa_thread.t_ctl_waitq); + + while (1) { + l_wait_event(ll_capa_thread.t_ctl_waitq, + !thread_is_running(&ll_capa_thread) || + have_expired_capa(), + &lwi); + + if (!thread_is_running(&ll_capa_thread)) + break; + + next = NULL; + + spin_lock(&capa_lock); + list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) { + __u64 ibits; + + LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC); + + if (!capa_is_to_expire(ocapa)) { + next = ocapa; + break; + } + + list_del_init(&ocapa->c_list); + + /* for MDS capability, only renew those which belong to + * dir, or its inode is opened, or client holds LOOKUP + * lock. + */ + /* ibits may be changed by ll_have_md_lock() so we have + * to set it each time */ + ibits = MDS_INODELOCK_LOOKUP; + if (capa_for_mds(&ocapa->c_capa) && + !S_ISDIR(ocapa->u.cli.inode->i_mode) && + obd_capa_open_count(ocapa) == 0 && + !ll_have_md_lock(ocapa->u.cli.inode, + &ibits, LCK_MINMODE)) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "skip renewal for"); + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* for OSS capability, only renew those whose inode is + * opened. 
+ */ + if (capa_for_oss(&ocapa->c_capa) && + obd_capa_open_count(ocapa) == 0) { + /* oss capa with open count == 0 won't renew, + * move to idle list */ + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* NB iput() is in ll_update_capa() */ + inode = igrab(ocapa->u.cli.inode); + if (inode == NULL) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "igrab failed for"); + continue; + } + + capa_get(ocapa); + ll_capa_renewed++; + spin_unlock(&capa_lock); + rc = md_renew_capa(ll_i2mdexp(inode), ocapa, + ll_update_capa); + spin_lock(&capa_lock); + if (rc) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renew failed: %d", rc); + ll_capa_renewal_failed++; + } + } + + if (next) + update_capa_timer(next, capa_renewal_time(next)); + + list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas, + c_list) { + if (!capa_is_expired(ocapa)) { + if (!next) + update_capa_timer(ocapa, + ocapa->c_expiry); + break; + } + + if (atomic_read(&ocapa->c_refc) > 1) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "expired(c_refc %d), don't release", + atomic_read(&ocapa->c_refc)); + /* don't try to renew any more */ + list_del_init(&ocapa->c_list); + continue; + } + + /* expired capa is released. */ + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired"); + ll_delete_capa(ocapa); + } + + spin_unlock(&capa_lock); + } + + thread_set_flags(&ll_capa_thread, SVC_STOPPED); + wake_up(&ll_capa_thread.t_ctl_waitq); + return 0; +} + +void ll_capa_timer_callback(unsigned long unused) +{ + wake_up(&ll_capa_thread.t_ctl_waitq); +} + +int ll_capa_thread_start(void) +{ + struct task_struct *task; + + init_waitqueue_head(&ll_capa_thread.t_ctl_waitq); + + task = kthread_run(capa_thread_main, NULL, "ll_capa"); + if (IS_ERR(task)) { + CERROR("cannot start expired capa thread: rc %ld\n", + PTR_ERR(task)); + return PTR_ERR(task); + } + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_running(&ll_capa_thread)); + + return 0; +} + +void ll_capa_thread_stop(void) +{ + thread_set_flags(&ll_capa_thread, SVC_STOPPING); + wake_up(&ll_capa_thread.t_ctl_waitq); + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_stopped(&ll_capa_thread)); +} + +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + int found = 0; + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0) + return NULL; + + LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW || + opc == CAPA_OPC_OSS_TRUNC); + + spin_lock(&capa_lock); + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if (capa_is_expired(ocapa)) + continue; + if ((opc & CAPA_OPC_OSS_WRITE) && + capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_READ) && + capa_opc_supported(&ocapa->c_capa, + CAPA_OPC_OSS_READ)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_TRUNC) && + capa_opc_supported(&ocapa->c_capa, opc)) { + found = 1; + break; + } + } + + if (found) { + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + capa_get(ocapa); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + } else { + ocapa = NULL; + + if (atomic_read(&ll_capa_debug)) { + CERROR("no capability for "DFID" opc %#llx\n", + PFID(&lli->lli_fid), opc); + atomic_set(&ll_capa_debug, 0); + } + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(ll_osscapa_get); + +struct obd_capa *ll_mdscapa_get(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + 
LASSERT(inode != NULL); + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0) + return NULL; + + spin_lock(&capa_lock); + ocapa = capa_get(lli->lli_mds_capa); + spin_unlock(&capa_lock); + if (!ocapa && atomic_read(&ll_capa_debug)) { + CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid)); + atomic_set(&ll_capa_debug, 0); + } + + return ocapa; +} + +static struct obd_capa *do_add_mds_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *old = lli->lli_mds_capa; + struct lustre_capa *capa = &ocapa->c_capa; + + if (!old) { + ocapa->u.cli.inode = inode; + lli->lli_mds_capa = ocapa; + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add MDS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update MDS"); + + capa_put(ocapa); + ocapa = old; + } + return ocapa; +} + +static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + /* inside capa_lock */ + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if ((capa_opc(&ocapa->c_capa) & opc) != opc) + continue; + + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + return ocapa; + } + + return NULL; +} + +static inline void inode_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *tmp; + struct list_head *next = NULL; + + /* capa is sorted in lli_oss_capas so lookup can always find the + * latest one */ + list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) { + if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) { + next = &tmp->u.cli.lli_list; + break; + } + } + LASSERT(&ocapa->u.cli.lli_list != next); + list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas); +} + +static struct obd_capa *do_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct obd_capa *old; + struct lustre_capa *capa = &ocapa->c_capa; + + LASSERTF(S_ISREG(inode->i_mode), + "inode has oss capa, but not regular file, mode: %d\n", + inode->i_mode); + + /* FIXME: can't replace it so easily with fine-grained opc */ + old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY); + if (!old) { + ocapa->u.cli.inode = inode; + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add OSS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update OSS"); + + capa_put(ocapa); + ocapa = old; + } + + inode_add_oss_capa(inode, ocapa); + return ocapa; +} + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa) +{ + spin_lock(&capa_lock); + ocapa = capa_for_mds(&ocapa->c_capa) ? 
do_add_mds_capa(inode, ocapa) : + do_add_oss_capa(inode, ocapa); + + /* truncate capa won't renew */ + if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) { + set_capa_expiry(ocapa); + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } + + spin_unlock(&capa_lock); + + atomic_set(&ll_capa_debug, 1); + return ocapa; +} + +static inline void delay_capa_renew(struct obd_capa *oc, unsigned long delay) +{ + /* NB: set a fake expiry for this capa to prevent it renew too soon */ + oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay)); +} + +static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa) +{ + struct inode *inode = ocapa->u.cli.inode; + int rc = 0; + + LASSERT(ocapa); + + if (IS_ERR(capa)) { + /* set error code */ + rc = PTR_ERR(capa); + spin_lock(&capa_lock); + if (rc == -ENOENT) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "renewal canceled because object removed"); + ll_capa_renewal_noent++; + } else { + ll_capa_renewal_failed++; + + /* failed capa won't be renewed any longer, but if -EIO, + * client might be doing recovery, retry in 2 min. */ + if (rc == -EIO && !capa_is_expired(ocapa)) { + delay_capa_renew(ocapa, 120); + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed: -EIO, retry in 2 mins"); + ll_capa_renewal_retries++; + goto retry; + } else { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed(rc: %d) for", rc); + } + } + + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, &ll_idle_capas); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + return rc; + } + + spin_lock(&ocapa->c_lock); + LASSERT(!memcmp(&ocapa->c_capa, capa, + offsetof(struct lustre_capa, lc_opc))); + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + spin_unlock(&ocapa->c_lock); + + spin_lock(&capa_lock); + if (capa_for_oss(capa)) + inode_add_oss_capa(inode, ocapa); + DEBUG_CAPA(D_SEC, capa, "renew"); +retry: + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + return rc; +} + +void ll_capa_open(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_inc(&lli->lli_open_count); +} + +void ll_capa_close(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_dec(&lli->lli_open_count); +} + +/* delete CAPA_OPC_OSS_TRUNC only */ +void ll_truncate_free_capa(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate"); + + /* release ref when find */ + capa_put(ocapa); + if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) { + spin_lock(&capa_lock); + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); + } +} + +void ll_clear_inode_capas(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa, *tmp; + + spin_lock(&capa_lock); + ocapa = lli->lli_mds_capa; + if (ocapa) + ll_delete_capa(ocapa); + + list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas, + u.cli.lli_list) + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); +} + +void ll_print_capa_stat(struct ll_sb_info *sbi) +{ + if (sbi->ll_flags & 
(LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + LCONSOLE_INFO("Fid capabilities renewed: %llu\n" + "Fid capabilities renewal ENOENT: %llu\n" + "Fid capabilities failed to renew: %llu\n" + "Fid capabilities renewal retries: %llu\n", + ll_capa_renewed, ll_capa_renewal_noent, + ll_capa_renewal_failed, ll_capa_renewal_retries); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_close.c b/kernel/drivers/staging/lustre/lustre/llite/llite_close.c new file mode 100644 index 000000000..a94ba02cc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_close.c @@ -0,0 +1,393 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_close.c + * + * Lustre Lite routines to issue a secondary close after writeback + */ + +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +/** records that a write is in flight */ +void vvp_write_pending(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_SOM_DIRTY; + if (page != NULL && list_empty(&page->cpg_pending_linkage)) + list_add(&page->cpg_pending_linkage, + &club->cob_pending_list); + spin_unlock(&lli->lli_lock); +} + +/** records that a write has completed */ +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + int rc = 0; + + spin_lock(&lli->lli_lock); + if (page != NULL && !list_empty(&page->cpg_pending_linkage)) { + list_del_init(&page->cpg_pending_linkage); + rc = 1; + } + spin_unlock(&lli->lli_lock); + if (rc) + ll_queue_done_writing(club->cob_inode, 0); +} + +/** Queues DONE_WRITING if + * - done writing is allowed; + * - inode has no no dirty pages; */ +void ll_queue_done_writing(struct inode *inode, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + + spin_lock(&lli->lli_lock); + lli->lli_flags |= flags; + + if ((lli->lli_flags & LLIF_DONE_WRITING) && + list_empty(&club->cob_pending_list)) { + struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq; + + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + /* DONE_WRITING is allowed and inode has no dirty page. */ + spin_lock(&lcq->lcq_lock); + + LASSERT(list_empty(&lli->lli_close_list)); + CDEBUG(D_INODE, "adding inode %lu/%u to close list\n", + inode->i_ino, inode->i_generation); + list_add_tail(&lli->lli_close_list, &lcq->lcq_head); + + /* Avoid a concurrent insertion into the close thread queue: + * an inode is already in the close thread, open(), write(), + * close() happen, epoch is closed as the inode is marked as + * LLIF_EPOCH_PENDING. When pages are written inode should not + * be inserted into the queue again, clear this flag to avoid + * it. */ + lli->lli_flags &= ~LLIF_DONE_WRITING; + + wake_up(&lcq->lcq_waitq); + spin_unlock(&lcq->lcq_lock); + } + spin_unlock(&lli->lli_lock); +} + +/** Pack SOM attributes info @opdata for CLOSE, DONE_WRITING rpc. */ +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + op_data->op_flags |= MF_SOM_CHANGE; + /* Check if Size-on-MDS attributes are valid. */ + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + + if (!cl_local_size(inode)) { + /* Send Size-on-MDS Attributes if valid. */ + op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET | + ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS; + } +} + +/** Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data. 
*/ +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + + spin_lock(&lli->lli_lock); + if (!(list_empty(&club->cob_pending_list))) { + if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) { + LASSERT(*och != NULL); + LASSERT(lli->lli_pending_och == NULL); + /* Inode is dirty and there is no pending write done + * request yet, DONE_WRITE is to be sent later. */ + lli->lli_flags |= LLIF_EPOCH_PENDING; + lli->lli_pending_och = *och; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + goto out; + } + if (flags & LLIF_DONE_WRITING) { + /* Some pages are still dirty, it is early to send + * DONE_WRITE. Wait until all pages will be flushed + * and try DONE_WRITE again later. */ + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + lli->lli_flags |= LLIF_DONE_WRITING; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + goto out; + } + } + CDEBUG(D_INODE, "Epoch %llu closed on "DFID"\n", + ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid)); + op_data->op_flags |= MF_EPOCH_CLOSE; + + if (flags & LLIF_DONE_WRITING) { + LASSERT(lli->lli_flags & LLIF_SOM_DIRTY); + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + *och = lli->lli_pending_och; + lli->lli_pending_och = NULL; + lli->lli_flags &= ~LLIF_EPOCH_PENDING; + } else { + /* Pack Size-on-MDS inode attributes only if they has changed */ + if (!(lli->lli_flags & LLIF_SOM_DIRTY)) { + spin_unlock(&lli->lli_lock); + goto out; + } + + /* There is a pending DONE_WRITE -- close epoch with no + * attribute change. */ + if (lli->lli_flags & LLIF_EPOCH_PENDING) { + spin_unlock(&lli->lli_lock); + goto out; + } + } + + LASSERT(list_empty(&club->cob_pending_list)); + lli->lli_flags &= ~LLIF_SOM_DIRTY; + spin_unlock(&lli->lli_lock); + ll_done_writing_attr(inode, op_data); + +out: + return; +} + +/** + * Cliens updates SOM attributes on MDS (including llog cookies): + * obd_getattr with no lock and md_setattr. + */ +int ll_som_update(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *request = NULL; + __u32 old_flags; + struct obdo *oa; + int rc; + + LASSERT(op_data != NULL); + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + + OBDO_ALLOC(oa); + if (!oa) { + CERROR("can't allocate memory for Size-on-MDS update.\n"); + return -ENOMEM; + } + + old_flags = op_data->op_flags; + op_data->op_flags = MF_SOM_CHANGE; + + /* If inode is already in another epoch, skip getattr from OSTs. */ + if (lli->lli_ioepoch == op_data->op_ioepoch) { + rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch, + old_flags & MF_GETATTR_LOCK); + if (rc) { + oa->o_valid = 0; + if (rc != -ENOENT) + CERROR("inode_getattr failed (%d): unable to send a Size-on-MDS attribute update for inode %lu/%u\n", + rc, inode->i_ino, + inode->i_generation); + } else { + CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n", + PFID(&lli->lli_fid)); + } + /* Install attributes into op_data. */ + md_from_obdo(op_data, oa, oa->o_valid); + } + + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, + NULL, 0, NULL, 0, &request, NULL); + ptlrpc_req_finished(request); + + OBDO_FREE(oa); + return rc; +} + +/** + * Closes the ioepoch and packs all the attributes into @op_data for + * DONE_WRITING rpc. 
+ */ +static void ll_prepare_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct obd_client_handle **och) +{ + ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING); + /* If there is no @och, we do not do D_W yet. */ + if (*och == NULL) + return; + + ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); +} + +/** Send a DONE_WRITING rpc. */ +static void ll_done_writing(struct inode *inode) +{ + struct obd_client_handle *och = NULL; + struct md_op_data *op_data; + int rc; + + LASSERT(exp_connect_som(ll_i2mdexp(inode))); + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return; + + ll_prepare_done_writing(inode, op_data, &och); + /* If there is no @och, we do not do D_W yet. */ + if (och == NULL) + goto out; + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc done_writing failed: rc = %d\n", + inode->i_ino, rc); + } +out: + ll_finish_md_op_data(op_data); + if (och) { + md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och); + OBD_FREE_PTR(och); + } +} + +static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq) +{ + struct ll_inode_info *lli = NULL; + + spin_lock(&lcq->lcq_lock); + + if (!list_empty(&lcq->lcq_head)) { + lli = list_entry(lcq->lcq_head.next, struct ll_inode_info, + lli_close_list); + list_del_init(&lli->lli_close_list); + } else if (atomic_read(&lcq->lcq_stop)) + lli = ERR_PTR(-EALREADY); + + spin_unlock(&lcq->lcq_lock); + return lli; +} + +static int ll_close_thread(void *arg) +{ + struct ll_close_queue *lcq = arg; + + complete(&lcq->lcq_comp); + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *lli; + struct inode *inode; + + l_wait_event_exclusive(lcq->lcq_waitq, + (lli = ll_close_next_lli(lcq)) != NULL, + &lwi); + if (IS_ERR(lli)) + break; + + inode = ll_info2i(lli); + CDEBUG(D_INFO, "done_writing for inode %lu/%u\n", + inode->i_ino, inode->i_generation); + ll_done_writing(inode); + iput(inode); + } + + CDEBUG(D_INFO, "ll_close exiting\n"); + complete(&lcq->lcq_comp); + return 0; +} + +int ll_close_thread_start(struct ll_close_queue **lcq_ret) +{ + struct ll_close_queue *lcq; + struct task_struct *task; + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD)) + return -EINTR; + + lcq = kzalloc(sizeof(*lcq), GFP_NOFS); + if (!lcq) + return -ENOMEM; + + spin_lock_init(&lcq->lcq_lock); + INIT_LIST_HEAD(&lcq->lcq_head); + init_waitqueue_head(&lcq->lcq_waitq); + init_completion(&lcq->lcq_comp); + + task = kthread_run(ll_close_thread, lcq, "ll_close"); + if (IS_ERR(task)) { + OBD_FREE(lcq, sizeof(*lcq)); + return PTR_ERR(task); + } + + wait_for_completion(&lcq->lcq_comp); + *lcq_ret = lcq; + return 0; +} + +void ll_close_thread_shutdown(struct ll_close_queue *lcq) +{ + init_completion(&lcq->lcq_comp); + atomic_inc(&lcq->lcq_stop); + wake_up(&lcq->lcq_waitq); + wait_for_completion(&lcq->lcq_comp); + OBD_FREE(lcq, sizeof(*lcq)); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h b/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h new file mode 100644 index 000000000..5f918e3c4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -0,0 +1,1521 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE 
HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include "../include/lustre_debug.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_disk.h" /* for s2sbi */ +#include "../include/lustre_eacl.h" + +/* for struct cl_lock_descr and struct cl_io */ +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "../include/lustre_mdc.h" +#include "../include/lustre_intent.h" +#include +#include + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") +#define LUSTRE_FPRIVATE(file) ((file)->private_data) + +struct ll_dentry_data { + struct lookup_intent *lld_it; + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + struct rcu_head lld_rcu_head; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +/* remote client permission cache */ +#define REMOTE_PERM_HASHSIZE 16 + +struct ll_getname_data { + struct dir_context ctx; + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +/* llite setxid/access permission for user on remote client */ +struct ll_remote_perm { + struct hlist_node lrp_list; + uid_t lrp_uid; + gid_t lrp_gid; + uid_t lrp_fsuid; + gid_t lrp_fsgid; + int lrp_access_perm; /* MAY_READ/WRITE/EXEC, this + is access permission with + lrp_fsuid/lrp_fsgid. */ +}; + +enum lli_flags { + /* MDS has an authority for the Size-on-MDS attributes. */ + LLIF_MDS_SIZE_LOCK = (1 << 0), + /* Epoch close is postponed. */ + LLIF_EPOCH_PENDING = (1 << 1), + /* DONE WRITING is allowed. */ + LLIF_DONE_WRITING = (1 << 2), + /* Sizeon-on-MDS attributes are changed. An attribute update needs to + * be sent to MDS. 
*/ + LLIF_SOM_DIRTY = (1 << 3), + /* File data is modified. */ + LLIF_DATA_MODIFIED = (1 << 4), + /* File is being restored */ + LLIF_FILE_RESTORING = (1 << 5), + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = (1 << 6), +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + __u32 lli_flags; + __u64 lli_ioepoch; + + spinlock_t lli_lock; + struct posix_acl *lli_posix_acl; + + struct hlist_head *lli_remote_perms; + struct mutex lli_rmtperm_mutex; + + /* identifying fields for both metadata and data stacks. */ + struct lu_fid lli_fid; + /* Parent fid for accessing default stripe data on parent directory + * for allocating OST objects after a mknod() and later open-by-FID. */ + struct lu_fid lli_pfid; + + struct list_head lli_close_list; + struct list_head lli_oss_capas; + /* open count currently used by capability only, indicate whether + * capability needs renewal */ + atomic_t lli_open_count; + struct obd_capa *lli_mds_capa; + unsigned long lli_rmtperm_time; + + /* handle is to be sent to MDS later on done_writing and setattr. + * Open handle data are needed for the recovery to reconstruct + * the inode state on the MDS. XXX: recovery is not ready yet. */ + struct obd_client_handle *lli_pending_och; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + struct ost_lvb lli_lvb; + spinlock_t lli_agl_lock; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* serialize normal readdir and statahead-readdir. */ + struct mutex d_readdir_mutex; + + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *d_opendir_key; + struct ll_statahead_info *d_sai; + /* protect statahead stuff. */ + spinlock_t d_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t d_opendir_pid; + } d; + +#define lli_readdir_mutex u.d.d_readdir_mutex +#define lli_opendir_key u.d.d_opendir_key +#define lli_sai u.d.d_sai +#define lli_sa_lock u.d.d_sa_lock +#define lli_opendir_pid u.d.d_opendir_pid + + /* for non-directory */ + struct { + struct mutex f_size_mutex; + char *f_symlink_name; + __u64 f_maxbytes; + /* + * struct rw_semaphore { + * signed long count; // align d.d_def_acl + * spinlock_t wait_lock; // align d.d_sa_lock + * struct list_head wait_list; + * } + */ + struct rw_semaphore f_trunc_sem; + struct mutex f_write_mutex; + + struct rw_semaphore f_glimpse_sem; + unsigned long f_glimpse_time; + struct list_head f_agl_list; + __u64 f_agl_index; + + /* for writepage() only to communicate to fsync */ + int f_async_rc; + + /* + * whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * so the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. 
+ */ + char f_jobid[JOBSTATS_JOBID_SIZE]; + } f; + +#define lli_size_mutex u.f.f_size_mutex +#define lli_symlink_name u.f.f_symlink_name +#define lli_maxbytes u.f.f_maxbytes +#define lli_trunc_sem u.f.f_trunc_sem +#define lli_write_mutex u.f.f_write_mutex +#define lli_glimpse_sem u.f.f_glimpse_sem +#define lli_glimpse_time u.f.f_glimpse_time +#define lli_agl_list u.f.f_agl_list +#define lli_agl_index u.f.f_agl_index +#define lli_async_rc u.f.f_async_rc +#define lli_jobid u.f.f_jobid + + } u; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + bool lli_has_smd; + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs;/* ll_xattr_entry->xe_list */ +}; + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +int ll_xattr_cache_destroy(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +/* FIXME: replace the name of this with LL_I to conform to kernel stuff */ +/* static inline struct ll_inode_info *LL_I(struct inode *inode) */ +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +/* default to about 40meg of readahead on a given system. That much tied + * up in 512k readahead requests serviced at 40ms each is about 1GB/s. 
*/ +#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT)) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT)) + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_max_read_ahead_whole_pages; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ +#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ +#define LL_SBI_FLOCK 0x04 +#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ +#define LL_SBI_ACL 0x10 /* support ACL */ +#define LL_SBI_RMT_CLIENT 0x40 /* remote client */ +#define LL_SBI_MDS_CAPA 0x80 /* support mds capa */ +#define LL_SBI_OSS_CAPA 0x100 /* support oss capa */ +#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ +#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ +#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ +#define LL_SBI_SOM_PREVIEW 0x1000 /* SOM preview mount option */ +#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ +#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ +#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ +#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ +#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ +#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ +#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ + +#define LL_SBI_FLAGS { \ + "nolck", \ + "checksum", \ + "flock", \ + "xattr", \ + "acl", \ + "???", \ + "rmt_client", \ + "mds_capa", \ + "oss_capa", \ + "flock", \ + "lru_resize", \ + "lazy_statfs", \ + "som", \ + "32bit_api", \ + "64bit_hash", \ + "agl", \ + "verbose", \ + "layout", \ + "user_fid2path",\ + "xattr", \ +} + +#define RCE_HASHES 32 + +struct rmtacl_ctl_entry { + struct list_head rce_list; + pid_t rce_key; /* hash key */ + int rce_ops; /* acl operation type */ +}; + +struct rmtacl_ctl_table { + spinlock_t rct_lock; + struct list_head rct_entries[RCE_HASHES]; +}; + +#define EE_HASHES 32 + +struct eacl_table { + spinlock_t et_lock; + struct list_head et_entries[EE_HASHES]; +}; + +struct ll_sb_info { + struct list_head ll_list; + /* this protects pglist and ra_info. It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct proc_dir_entry* ll_proc_root; + struct lu_fid ll_root_fid; /* root object fid */ + + int ll_flags; + unsigned int ll_umounting:1, + ll_xattr_cache_enabled:1; + struct list_head ll_conn_chain; /* per-conn chain of SBs */ + struct lustre_client_ocd ll_lco; + + struct list_head ll_orphan_dentry_list; /*please don't ask -p*/ + struct ll_close_queue *ll_lcq; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + struct cl_client_cache ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + struct file_operations *ll_fop; + + /* =0 - hold lock over whole read/write + * >0 - max. chunk to be read/written w/o lock re-acquiring */ + unsigned long ll_max_rw_chunk; + unsigned int ll_md_brw_size; /* used by readdir */ + + struct lu_site *ll_site; + struct cl_device *ll_cl; + /* Statistics */ + struct ll_rw_extents_info ll_rw_extents_info; + int ll_extent_process_count; + struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; + unsigned int ll_offset_process_count; + struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustered nfs */ + struct rmtacl_ctl_table ll_rct; + struct eacl_table ll_et; + __kernel_fsid_t ll_fsid; +}; + +#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) + +struct ll_ra_read { + pgoff_t lrr_start; + pgoff_t lrr_count; + struct task_struct *lrr_reader; + struct list_head lrr_linkage; +}; + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* + * index of the last page that read(2) needed and that wasn't in the + * cache. 
Used by ras_update() to detect seeks. + * + * XXX nikita: if access seeks into cached region, Lustre doesn't see + * this. + */ + unsigned long ras_last_readpage; + /* + * number of pages read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + unsigned long ras_window_start, ras_window_len; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + unsigned long ras_next_readahead; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* + * list of struct ll_ra_read's one per read(2) call current in + * progress against this file descriptor. Used by read-ahead code, + * protected by ->ras_lock. + */ + struct list_head ras_read_beads; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_pages + stride_gap; + * ras_stride_pages = stride_pages; + * Note: all these three items are counted by pages. + */ + unsigned long ras_stride_length; + unsigned long ras_stride_pages; + pgoff_t ras_stride_offset; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ccc_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. 
+ * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; +}; + +struct lov_stripe_md; + +extern spinlock_t inode_lock; + +extern struct proc_dir_entry *proc_lustre_fs_root; + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API)); +#else + return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); +#endif +} + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar); +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); +struct ll_ra_read *ll_ra_read_get(struct file *f); + +/* llite/lproc_llite.c */ +#if defined (CONFIG_PROC_FS) +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc); +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars); +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#else +static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc){return 0;} +static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {} +static inline +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} +static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +static inline void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) {} +#endif + + +/* llite/dir.c */ +void ll_release_page(struct page *page, int remove); +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain); +int ll_dir_read(struct inode *inode, struct dir_context *ctx); + +int ll_get_mdt_idx(struct inode *inode); +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +int ll_objects_destroy(struct ptlrpc_request *request, + struct inode *dir); +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); + +/* llite/rw.c */ +int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags); + +#ifndef MS_HAS_NEW_AOPS +extern const struct address_space_operations 
ll_aops; +#else +extern const struct address_space_operations_ext ll_aops; +#endif + +/* llite/file.c */ +extern struct file_operations ll_file_operations; +extern struct file_operations ll_file_operations_flock; +extern struct file_operations ll_file_operations_noflock; +extern struct inode_operations ll_file_inode_operations; +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + ldlm_mode_t l_req_mode); +extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + ldlm_mode_t mode); +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_glimpse_ioctl(struct ll_sb_info *sbi, + struct lov_stripe_md *lsm, lstat_t *st); +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch); +int ll_release_openhandle(struct inode *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags); +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data); +int ll_som_update(struct inode *inode, struct md_op_data *op_data); +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync); +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh); +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +struct posix_acl *ll_get_acl(struct inode *inode, int type); + +int ll_inode_permission(struct inode *inode, int mask); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + int flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request); +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +int ll_merge_lvb(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock); +int ll_hsm_release(struct inode *inode); + +/* llite/dcache.c */ + +int ll_d_init(struct dentry *de); +extern const struct dentry_operations ll_d_ops; +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_invalidate_aliases(struct inode *); +void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct inode *inode); + +/* llite/llite_lib.c */ +extern struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_clear_inode(struct inode *inode); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); +int ll_setattr(struct dentry *de, struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, 
__u32 flags); +void ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_read_inode2(struct inode *inode, void *opaque); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +void ll_dirty_page_discard_warn(struct page *page, int ioret); +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *, struct lookup_intent *); +int ll_obd_statfs(struct inode *inode, void *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize); +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *default_cookiesize); +int ll_process_config(struct lustre_cfg *lcfg); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); + +/* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); + +/* llite/symlink.c */ +extern struct inode_operations ll_fast_symlink_inode_operations; + +/* llite/llite_close.c */ +struct ll_close_queue { + spinlock_t lcq_lock; + struct list_head lcq_head; + wait_queue_head_t lcq_waitq; + struct completion lcq_comp; + atomic_t lcq_stop; +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode); + + +void vvp_write_pending (struct ccc_object *club, struct ccc_page *page); +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page); + +/* specific architecture can implement only part of this list */ +enum vvp_io_subtype { + /** normal IO */ + IO_NORMAL, + /** io started from splice_{read|write} */ + IO_SPLICE +}; + +/* IO subtypes */ +struct vvp_io { + /** io subtype */ + enum vvp_io_subtype cui_io_subtype; + + union { + struct { + struct pipe_inode_info *cui_pipe; + unsigned int cui_flags; + } splice; + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + struct vm_fault_api { + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + } fault; + } fault; + } u; + /** + * Read-ahead state used by read and page-fault IO contexts. + */ + struct ll_ra_read cui_bead; + /** + * Set when cui_bead has been initialized. + */ + int cui_ra_window_set; +}; + +/** + * IO arguments for various VFS I/O interfaces. 
+ */ +struct vvp_io_args { + /** normal/splice */ + enum vvp_io_subtype via_io_subtype; + + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + struct { + struct pipe_inode_info *via_pipe; + unsigned int via_flags; + } splice; + } u; +}; + +struct ll_cl_context { + void *lcc_cookie; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + struct lu_env *lcc_env; + int lcc_refcheck; +}; + +struct vvp_thread_info { + struct vvp_io_args vti_args; + struct ra_io_arg vti_ria; + struct ll_cl_context vti_io_ctx; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + extern struct lu_context_key vvp_key; + struct vvp_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &vvp_key); + LASSERT(info != NULL); + return info; +} + +static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env, + enum vvp_io_subtype type) +{ + struct vvp_io_args *ret = &vvp_env_info(env)->vti_args; + + ret->via_io_subtype = type; + + return ret; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + extern struct lu_context_key vvp_session_key; + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +int vvp_global_init(void); +void vvp_global_fini(void); + +void ll_queue_done_writing(struct inode *inode, unsigned long flags); +void ll_close_thread_shutdown(struct ll_close_queue *lcq); +int ll_close_thread_start(struct ll_close_queue **lcq_ret); + +/* llite/llite_mmap.c */ + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file *file, struct vm_area_struct *vma); +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +static inline void ll_invalidate_page(struct page *vmpage) +{ + struct address_space *mapping = vmpage->mapping; + loff_t offset = vmpage->index << PAGE_CACHE_SHIFT; + + LASSERT(PageLocked(vmpage)); + if (mapping == NULL) + return; + + ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE); + truncate_complete_page(mapping, vmpage); +} + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +/* FIXME: replace the name of this with LL_SB to conform to kernel stuff */ +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + 
+ return fid; +} + +static inline __u64 ll_file_maxbytes(struct inode *inode) +{ + return ll_i2info(inode)->lli_maxbytes; +} + +/* llite/xattr.c */ +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_removexattr(struct dentry *dentry, const char *name); + +/* llite/remote_perm.c */ +extern struct kmem_cache *ll_remote_perm_cachep; +extern struct kmem_cache *ll_rmtperm_hash_cachep; + +void free_rmtperm_hash(struct hlist_head *hash); +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm); +int lustre_check_remote_perm(struct inode *inode, int mask); + +/* llite/llite_capa.c */ +extern struct timer_list ll_capa_timer; + +int ll_capa_thread_start(void); +void ll_capa_thread_stop(void); +void ll_capa_timer_callback(unsigned long unused); + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa); + +void ll_capa_open(struct inode *inode); +void ll_capa_close(struct inode *inode); + +struct obd_capa *ll_mdscapa_get(struct inode *inode); +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc); + +void ll_truncate_free_capa(struct obd_capa *ocapa); +void ll_clear_inode_capas(struct inode *inode); +void ll_print_capa_stat(struct ll_sb_info *sbi); + +/* llite/llite_cl.c */ +extern struct lu_device_type vvp_device_type; + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); +void ll_io_init(struct cl_io *io, const struct file *file, int write); + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit); +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which); + +/* llite/llite_rmtacl.c */ +#ifdef CONFIG_FS_POSIX_ACL +struct eacl_entry { + struct list_head ee_list; + pid_t ee_key; /* hash key */ + struct lu_fid ee_fid; + int ee_type; /* ACL type for ACCESS or DEFAULT */ + ext_acl_xattr_header *ee_acl; +}; + +u64 rce_ops2valid(int ops); +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key); +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops); +int rct_del(struct rmtacl_ctl_table *rct, pid_t key); +void rct_init(struct rmtacl_ctl_table *rct); +void rct_fini(struct rmtacl_ctl_table *rct); + +void ee_free(struct eacl_entry *ee); +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header); +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type); +void et_search_free(struct eacl_table *et, pid_t key); +void et_init(struct eacl_table *et); +void et_fini(struct eacl_table *et); +#else +static inline u64 rce_ops2valid(int ops) +{ + return 0; +} +#endif + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 8192 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct inode *sai_inode; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_generation; /* generation for statahead */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 
sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, it includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_agl_valid:1;/* AGL is valid for the dir */ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct ptlrpc_thread sai_thread; /* stat-ahead thread */ + struct ptlrpc_thread sai_agl_thread; /* AGL thread */ + struct list_head sai_entries; /* entry list */ + struct list_head sai_entries_received; /* entries returned */ + struct list_head sai_entries_stated; /* entries stated */ + struct list_head sai_entries_agl; /* AGL entries to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int do_statahead_enter(struct inode *dir, struct dentry **dentry, + int only_unplug); +void ll_stop_statahead(struct inode *dir, void *key); + +static inline int ll_glimpse_size(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = cfs_time_current(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +static inline void +ll_statahead_mark(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct ll_dentry_data *ldd = ll_d2d(dentry); + + /* not the same process, don't mark */ + if (lli->lli_opendir_pid != current_pid()) + return; + + LASSERT(ldd != NULL); + if (sai != NULL) + ldd->lld_sa_generation = sai->sai_generation; +} + +static inline int +d_need_statahead(struct inode *dir, struct dentry *dentryp) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return -EAGAIN; + + lli = ll_i2info(dir); + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current_pid()) + return -EAGAIN; + + /* statahead has been stopped */ + if (lli->lli_opendir_key == NULL) + return -EAGAIN; + + ldd = ll_d2d(dentryp); + /* + * When stats a dentry, the system trigger more than once "revalidate" + * or "lookup", for "getattr", for "getxattr", and maybe for others. + * Under patchless client mode, the operation intent is not accurate, + * which maybe misguide the statahead thread. For example: + * The "revalidate" call for "getattr" and "getxattr" of a dentry maybe + * have the same operation intent -- "IT_GETATTR". + * In fact, one dentry should has only one chance to interact with the + * statahead thread, otherwise the statahead windows will be confused. 
+ * The solution is as following: + * Assign "lld_sa_generation" with "sai_generation" when a dentry + * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" + * will bypass interacting with statahead thread for checking: + * "lld_sa_generation == lli_sai->sai_generation" + */ + if (ldd && lli->lli_sai && + ldd->lld_sa_generation == lli->lli_sai->sai_generation) + return -EAGAIN; + + return 1; +} + +static inline int +ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug) +{ + int ret; + + ret = d_need_statahead(dir, *dentryp); + if (ret <= 0) + return ret; + + return do_statahead_enter(dir, dentryp, only_unplug); +} + +/* llite ioctl register support routine */ +enum llioc_iter { + LLIOC_CONT = 0, + LLIOC_STOP +}; + +#define LLIOC_MAX_CMD 256 + +/* + * Rules to write a callback function: + * + * Parameters: + * @magic: Dynamic ioctl call routine will feed this value with the pointer + * returned to ll_iocontrol_register. Callback functions should use this + * data to check the potential collasion of ioctl cmd. If collasion is + * found, callback function should return LLIOC_CONT. + * @rcp: The result of ioctl command. + * + * Return values: + * If @magic matches the pointer returned by ll_iocontrol_data, the + * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. + */ +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg, + void *magic, int *rcp); + +/* export functions */ +/* Register ioctl block dynamatically for a regular file. + * + * @cmd: the array of ioctl command set + * @count: number of commands in the @cmd + * @cb: callback function, it will be called if an ioctl command is found to + * belong to the command list @cmd. + * + * Return value: + * A magic pointer will be returned if success; + * otherwise, NULL will be returned. + * */ +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); +void ll_iocontrol_unregister(void *magic); + + +/* lclient compat stuff */ +#define cl_inode_info ll_inode_info +#define cl_i2info(info) ll_i2info(info) +#define cl_inode_mode(inode) ((inode)->i_mode) +#define cl_i2sbi ll_i2sbi + +static inline struct ll_file_data *cl_iattr2fd(struct inode *inode, + const struct iattr *attr) +{ + LASSERT(attr->ia_valid & ATTR_FILE); + return LUSTRE_FPRIVATE(attr->ia_file); +} + +static inline void cl_isize_lock(struct inode *inode) +{ + ll_inode_size_lock(inode); +} + +static inline void cl_isize_unlock(struct inode *inode) +{ + ll_inode_size_unlock(inode); +} + +static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms) +{ + LASSERT(mutex_is_locked(&ll_i2info(inode)->lli_size_mutex)); + i_size_write(inode, kms); +} + +static inline void cl_isize_write(struct inode *inode, loff_t kms) +{ + ll_inode_size_lock(inode); + i_size_write(inode, kms); + ll_inode_size_unlock(inode); +} + +#define cl_isize_read(inode) i_size_read(inode) + +static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + return ll_merge_lvb(env, inode); +} + +#define cl_inode_atime(inode) LTIME_S((inode)->i_atime) +#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime) +#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime) + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +/** direct write pages */ +struct ll_dio_pages { + /** page array to be written. 
we don't support + * partial pages except the last one. */ + struct page **ldp_pages; + /* offset of each page */ + loff_t *ldp_offsets; + /** if ldp_offsets is NULL, it means a sequential + * pages to be written, then this is the file offset + * of the * first page. */ + loff_t ldp_start_offset; + /** how many bytes are to be written. */ + size_t ldp_size; + /** # of pages in the array. */ + int ldp_nr; +}; + +static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt, + int rc) +{ + int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ : + LPROC_LL_OSC_WRITE; + + ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc); +} + +extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct inode *inode = file_inode(file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->d.lustre.it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. The inode will be attached to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->d.lustre.it_remote_lock_mode) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode %p(%lu/%u) for remote lock %#llx\n", + inode, + inode->i_ino, inode->i_generation, + handle.cookie); + md_set_lock_data(exp, &handle.cookie, inode, NULL); + } + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u) for lock %#llx\n", + inode, inode->i_ino, + inode->i_generation, handle.cookie); + + md_set_lock_data(exp, &handle.cookie, inode, + &it->d.lustre.it_lock_bits); + it->d.lustre.it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->d.lustre.it_lock_bits; +} + +static inline void ll_lock_dcache(struct inode *inode) +{ + spin_lock(&inode->i_lock); +} + +static inline void ll_unlock_dcache(struct inode *inode) +{ + spin_unlock(&inode->i_lock); +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + return (lld == NULL) || lld->lld_invalid; +} + +static inline void __d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + if (lld != NULL) + lld->lld_invalid = 1; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; + * else dput() of the last refcount will unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry, int nested) +{ + CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", + dentry, dentry, + dentry->d_parent, d_inode(dentry), d_count(dentry)); + + spin_lock_nested(&dentry->d_lock, + nested ? 
DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); + __d_lustre_invalidate(dentry); + if (d_count(dentry) == 0) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry) != NULL); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +enum { + LL_LAYOUT_GEN_NONE = ((__u32)-2), /* layout lock was cancelled */ + LL_LAYOUT_GEN_EMPTY = ((__u32)-1) /* for empty layout */ +}; + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +#endif /* LLITE_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c b/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c new file mode 100644 index 000000000..a27af7882 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -0,0 +1,2354 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" +#include "../include/lustre_log.h" +#include "../include/cl_object.h" +#include "../include/obd_cksum.h" +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; +struct proc_dir_entry *proc_lustre_fs_root; + +static LIST_HEAD(ll_super_blocks); +static DEFINE_SPINLOCK(ll_sb_lock); + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + class_uuid_t uuid; + int i; + + sbi = kzalloc(sizeof(*sbi), GFP_NOFS); + if (!sbi) + return NULL; + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + if (pages >> (20 - PAGE_CACHE_SHIFT) < 512) + lru_page_max = pages / 2; + else + lru_page_max = (pages / 4) * 3; + + /* initialize lru data */ + atomic_set(&sbi->ll_cache.ccc_users, 0); + sbi->ll_cache.ccc_lru_max = lru_page_max; + atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max); + spin_lock_init(&sbi->ll_cache.ccc_lru_lock); + INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru); + + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, + SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + SBI_DEFAULT_READAHEAD_WHOLE_MAX; + INIT_LIST_HEAD(&sbi->ll_conn_chain); + INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); + + ll_generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); + + spin_lock(&ll_sb_lock); + list_add_tail(&sbi->ll_list, &ll_super_blocks); + spin_unlock(&ll_sb_lock); + + sbi->ll_flags |= LL_SBI_VERBOSE; + sbi->ll_flags |= LL_SBI_CHECKSUM; + + sbi->ll_flags |= LL_SBI_LRU_RESIZE; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_r_hist.oh_lock); + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
+ pp_w_hist.oh_lock); + } + + /* metadata statahead is enabled by default */ + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_agl_total, 0); + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + + return sbi; +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + + if (sbi != NULL) { + spin_lock(&ll_sb_lock); + list_del(&sbi->ll_list); + spin_unlock(&ll_sb_lock); + OBD_FREE(sbi, sizeof(*sbi)); + } +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt, + struct vfsmount *mnt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_capa *oc = NULL; + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + + obd = class_name2obd(md); + if (!obd) { + CERROR("MD %s: not setup or attached\n", md); + return -EINVAL; + } + + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + return -ENOMEM; + + osfs = kzalloc(sizeof(*osfs), GFP_NOFS); + if (!osfs) { + OBD_FREE_PTR(data); + return -ENOMEM; + } + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + dt, md); + if (err < 0) + CERROR("could not register mount in /proc/fs/lustre\n"); + } + + /* indicate the features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR | + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH| + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (sbi->ll_flags & LL_SBI_LRU_RESIZE) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#ifdef CONFIG_FS_POSIX_ACL + data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK; +#endif + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & MS_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef HAVE_MS_FLOCK_LOCK + /* force vfs to use lustre handler for flock() calls - bug 10743 */ + sb->s_flags |= MS_FLOCK_LOCK; +#endif +#ifdef MS_HAS_NEW_AOPS + sb->s_flags |= MS_HAS_NEW_AOPS; +#endif + + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; + + /* real client */ + data->ocd_connect_flags |= OBD_CONNECT_REAL; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, + data, NULL); + if 
(err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n", + md); + goto out; + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + goto out; + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_md; + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * available */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_FOR_MDT0); + if (err) + goto out_md_fid; + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + buf = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + obd_connect_flags2str(buf, PAGE_CACHE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_CACHE_SIZE); + err = -EPROTO; + goto out_md_fid; + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data, NULL); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_md_fid; + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK; + + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef MS_POSIXACL + sb->s_flags |= MS_POSIXACL; +#endif + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef MS_POSIXACL + sb->s_flags &= ~MS_POSIXACL; +#endif + sbi->ll_flags &= ~LL_SBI_ACL; + } + + if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) { + if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + sbi->ll_flags |= LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client is set as remote by default.\n"); + } + } else { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + sbi->ll_flags &= ~LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client claims to be remote, but server rejected, forced to be local.\n"); + } + } + + if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) { + LCONSOLE_INFO("client enabled MDS capability!\n"); + sbi->ll_flags |= LL_SBI_MDS_CAPA; + } + + if (data->ocd_connect_flags & 
OBD_CONNECT_OSS_CAPA) { + LCONSOLE_INFO("client enabled OSS capability!\n"); + sbi->ll_flags |= LL_SBI_OSS_CAPA; + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; + + if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + sbi->ll_md_brw_size = data->ocd_brw_size; + else + sbi->ll_md_brw_size = PAGE_CACHE_SIZE; + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) { + LCONSOLE_INFO("Layout lock feature supported.\n"); + sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; + } + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO( + "%s: disabling xattr cache due to unknown maximum xattr size.\n", + dt); + } else { + sbi->ll_flags |= LL_SBI_XATTR_CACHE; + sbi->ll_xattr_cache_enabled = 1; + } + } + + obd = class_name2obd(dt); + if (!obd) { + CERROR("DT %s: not setup or attached\n", dt); + err = -ENODEV; + goto out_md_fid; + } + + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR| + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | + OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /proc. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = cksum_types_supported_client(); + } + + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n", + data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing recovery, of which this client is not a part. 
Please wait for recovery to complete, abort, or time out.\n", + dt); + goto out_md; + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + goto out_md; + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + goto out_dt; + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + goto out_lock_cn_cb; + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + err = -EINVAL; + goto out_lock_cn_cb; + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif + + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + valid |= OBD_MD_FLRMTPERM; + else if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + err = -ENOMEM; + goto out_lock_cn_cb; + } + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_capa1 = oc; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + if (oc) + capa_put(oc); + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_lock_cn_cb; + } + + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + goto out_lock_cn_cb; + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, + sbi->ll_flags & LL_SBI_32BIT_API), + &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + ptlrpc_req_finished(request); + + if (root == NULL || IS_ERR(root)) { + if (lmd.lsm) + obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm); +#ifdef CONFIG_FS_POSIX_ACL + if (lmd.posix_acl) { + posix_acl_release(lmd.posix_acl); + lmd.posix_acl = NULL; + } +#endif + err = IS_ERR(root) ? 
PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("lustre_lite: bad iget4 for root\n"); + goto out_root; + } + + err = ll_close_thread_start(&sbi->ll_lcq); + if (err) { + CERROR("cannot start close thread: rc %d\n", err); + goto out_root; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + rct_init(&sbi->ll_rct); + et_init(&sbi->ll_et); + } +#endif + + checksum = sbi->ll_flags & LL_SBI_CHECKSUM; + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, + NULL); + cl_sb_init(sb); + + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, sizeof(sbi->ll_cache), + &sbi->ll_cache, NULL); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + CERROR("%s: can't make root dentry\n", + ll_get_fsname(sb, NULL, 0)); + err = -ENOMEM; + goto out_lock_cn_cb; + } + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. */ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) { + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid); + } + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + return err; +out_root: + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* Make sure all OScs are gone, since cl_cache is accessing sbi. */ + obd_zombie_barrier(); +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + lprocfs_unregister_mountpoint(sbi); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL); + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + return rc; +} + +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + return rc; +} + +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_COOKIESIZE), + KEY_MAX_COOKIESIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get max cookiesize error rc %d\n", rc); + + return rc; +} + +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_COOKIESIZE), + KEY_DEFAULT_COOKIESIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get default cookiesize error rc %d\n", rc); + + return rc; +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + et_fini(&sbi->ll_et); + rct_fini(&sbi->ll_rct); + } +#endif + + ll_close_thread_shutdown(sbi->ll_lcq); + + 
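/*
 * [Editorial note: illustrative sketch only, not part of the imported patch.]
 * client_common_fill_super() above acquires its resources in a fixed order
 * (MD connect, MD FID init, statfs, DT connect, DT FID init, root inode,
 * close thread) and unwinds them in reverse through the out_* labels;
 * client_common_put_super() here releases the same resources in that
 * reverse order on a clean unmount.  A minimal user-space sketch of the
 * same goto-unwind idiom, with hypothetical resource names:
 *
 *	#include <stdlib.h>
 *
 *	static int setup_connections(void **md, void **dt)
 *	{
 *		*md = malloc(64);		// "connect" metadata export
 *		if (!*md)
 *			goto out;
 *		*dt = malloc(64);		// "connect" data export
 *		if (!*dt)
 *			goto out_md;		// undo only what is already held
 *		return 0;
 *
 *	out_md:
 *		free(*md);
 *		*md = NULL;
 *	out:
 *		return -1;
 *	}
 *
 *	static void teardown_connections(void *md, void *dt)
 *	{
 *		free(dt);			// release in reverse order
 *		free(md);
 *	}
 */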
cl_sb_fini(sb); + + list_del(&sbi->ll_conn_chain); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* wait till all OSCs are gone, since cl_cache is accessing sbi. + * see LU-2543. */ + obd_zombie_barrier(); + + lprocfs_unregister_mountpoint(sbi); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + + /* not init sb ?*/ + if (!(sb->s_flags & MS_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need to restore s_dev from changed for clustered NFS before + * put_super because new kernels have cached s_dev and change sb->s_dev + * in put_super not affected real removing devices */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + sbi->ll_umounting = 1; + } +} + +static inline int ll_set_opt(const char *opt, char *data, int fl) +{ + if (strncmp(opt, data, strlen(opt)) != 0) + return 0; + else + return fl; +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, int *flags) +{ + int tmp; + char *s1 = options, *s2; + + if (!options) + return 0; + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags &= ~tmp; + goto next; + } + + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", + s1); + return -EINVAL; + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; + } + return 0; +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + 
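/*
 * [Editorial note: illustrative sketch only, not part of the imported patch.]
 * ll_options() above walks the comma-separated client mount-option string;
 * ll_set_opt() is a plain prefix match that returns the flag bit for a
 * recognised option, and the caller either ORs that bit into *flags or,
 * for the "no..." variants, clears it.  A self-contained user-space
 * analogue of that loop (flag values and option names hypothetical):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define OPT_FLOCK	0x01
 *	#define OPT_CHECKSUM	0x02
 *
 *	static int set_opt(const char *opt, const char *s, int fl)
 *	{
 *		return strncmp(opt, s, strlen(opt)) ? 0 : fl;
 *	}
 *
 *	static void parse_opts(char *s, int *flags)
 *	{
 *		while (*s) {
 *			char *next = strchr(s, ',');
 *
 *			if (set_opt("noflock", s, OPT_FLOCK))
 *				*flags &= ~OPT_FLOCK;
 *			else if (set_opt("flock", s, OPT_FLOCK))
 *				*flags |= OPT_FLOCK;
 *			else if (set_opt("nochecksum", s, OPT_CHECKSUM))
 *				*flags &= ~OPT_CHECKSUM;
 *			else if (set_opt("checksum", s, OPT_CHECKSUM))
 *				*flags |= OPT_CHECKSUM;
 *
 *			if (!next)
 *				break;
 *			s = next + 1;
 *		}
 *	}
 *
 *	int main(void)
 *	{
 *		int flags = 0;
 *		char opts[] = "flock,nochecksum";
 *
 *		parse_opts(opts, &flags);
 *		printf("flags=%#x\n", flags);	// prints "flags=0x1"
 *		return 0;
 *	}
 */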
lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + lli->lli_ioepoch = 0; + lli->lli_maxbytes = MAX_LFS_FILESIZE; + spin_lock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + lli->lli_remote_perms = NULL; + mutex_init(&lli->lli_rmtperm_mutex); + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + INIT_LIST_HEAD(&lli->lli_close_list); + INIT_LIST_HEAD(&lli->lli_oss_capas); + atomic_set(&lli->lli_open_count, 0); + lli->lli_mds_capa = NULL; + lli->lli_rmtperm_time = 0; + lli->lli_pending_och = NULL; + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + lli->lli_has_smd = false; + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + mutex_init(&lli->lli_readdir_mutex); + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + } else { + mutex_init(&lli->lli_size_mutex); + lli->lli_symlink_name = NULL; + init_rwsem(&lli->lli_trunc_sem); + mutex_init(&lli->lli_write_mutex); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = 0; + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + } + mutex_init(&lli->lli_layout_mutex); +} + +static inline int ll_bdi_register(struct backing_dev_info *bdi) +{ + static atomic_t ll_bdi_num = ATOMIC_INIT(0); + + bdi->name = "lustre"; + return bdi_register(bdi, NULL, "lustre-%d", + atomic_inc_return(&ll_bdi_num)); +} + +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + int err; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + + cfg = kzalloc(sizeof(*cfg), GFP_NOFS); + if (!cfg) + return -ENOMEM; + + try_module_get(THIS_MODULE); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) { + module_put(THIS_MODULE); + OBD_FREE_PTR(cfg); + return -ENOMEM; + } + + err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + if (err) + goto out_free; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + goto out_free; + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = 0; + err = ll_bdi_register(&lsi->lsi_bdi); + if (err) + goto out_free; + + sb->s_bdi = &lsi->lsi_bdi; + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; + + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. 
+ Use the address of the super itself.*/ + cfg->cfg_instance = sb; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) { + CERROR("Unable to process log: %d\n", err); + goto out_free; + } + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n", + profilenm); + err = -EINVAL; + goto out_free; + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + if (!dt) { + err = -ENOMEM; + goto out_free; + } + + md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance); + if (!md) { + err = -ENOMEM; + goto out_free; + } + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt, mnt); + +out_free: + if (md) + OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2); + if (dt) + OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2); + if (err) + ll_put_super(sb); + else if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Mounted %s\n", profilenm); + + OBD_FREE_PTR(cfg); + return err; +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int next, force = 1; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + + ll_print_capa_stat(sbi); + + cfg.cfg_instance = sb; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = sb; + lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* We need to set force before the lov_disconnect in + lustre_common_put_super, since l_d cleans up osc's as well. */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_lcq) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) + class_manual_cleanup(obd); + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } + + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; + + lustre_common_put_super(sb); + + module_put(THIS_MODULE); +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info *lli; + + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? 
D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } + + spin_lock(&lli->lli_lock); + ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + LASSERT(lli->lli_posix_acl == NULL); + if (lli->lli_remote_perms) { + free_rmtperm_hash(lli->lli_remote_perms); + lli->lli_remote_perms = NULL; + } + } +#ifdef CONFIG_FS_POSIX_ACL + else if (lli->lli_posix_acl) { + LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); + LASSERT(lli->lli_remote_perms == NULL); + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +#endif + lli->lli_inode_magic = LLI_INODE_DEAD; + + ll_clear_inode_capas(inode); + if (!S_ISDIR(inode->i_mode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. + */ + cl_inode_fini(inode); + lli->lli_has_smd = false; +} + +static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, + struct md_open_data **mod) +{ + struct lustre_md md; + struct inode *inode = d_inode(dentry); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0, + &request, mod); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + return rc; + } + + rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + return rc; + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. 
*/ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + + /* Extract epoch data if obtained. */ + op_data->op_handle = md.body->handle; + op_data->op_ioepoch = md.body->ioepoch; + + ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + return rc; +} + +/* Close IO epoch and send Size-on-MDS attribute update. */ +static int ll_setattr_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + LASSERT(op_data != NULL); + if (!S_ISREG(inode->i_mode)) + return 0; + + CDEBUG(D_INODE, "Epoch %llu closed on "DFID" for truncate\n", + op_data->op_ioepoch, PFID(&lli->lli_fid)); + + op_data->op_flags = MF_EPOCH_CLOSE; + ll_done_writing_attr(inode, op_data); + ll_pack_inode2opdata(inode, op_data, NULL); + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute + * from OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc truncate failed: rc = %d\n", + inode->i_ino, rc); + } + return rc; +} + +static int ll_setattr_ost(struct inode *inode, struct iattr *attr) +{ + struct obd_capa *capa; + int rc; + + if (attr->ia_valid & ATTR_SIZE) + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC); + else + capa = ll_mdscapa_get(inode); + + rc = cl_setattr_ost(inode, attr, capa); + + if (attr->ia_valid & ATTR_SIZE) + ll_truncate_free_capa(capa); + else + capa_put(capa); + + return rc; +} + + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + * + * In case of HSMimport, we only set attr on MDS. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) +{ + struct inode *inode = d_inode(dentry); + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + struct md_open_data *mod = NULL; + bool file_is_released = false; + int rc = 0, rc1 = 0; + + CDEBUG(D_VFSTRACE, + "%s: setattr inode %p/fid:"DFID + " from %llu to %llu, valid %x, hsm_import %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size, + attr->ia_valid, hsm_import); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + return rc; + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above. 
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, "file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + return -EFBIG; + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !capable(CFS_CAP_FOWNER)) + return -EPERM; + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (attr->ia_valid & ATTR_CTIME) { + attr->ia_ctime = CURRENT_TIME; + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = CURRENT_TIME; + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = CURRENT_TIME; + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n", + LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), + get_seconds()); + + /* If we are changing file size, file content is modified, flag it. */ + if (attr->ia_valid & ATTR_SIZE) { + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return -ENOMEM; + + if (!S_ISDIR(inode->i_mode)) { + if (attr->ia_valid & ATTR_SIZE) + inode_dio_write_done(inode); + mutex_unlock(&inode->i_mutex); + } + + memcpy(&op_data->op_attr, attr, sizeof(*attr)); + + /* Open epoch for truncate. 
*/ + if (exp_connect_som(ll_i2mdexp(inode)) && + (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET))) + op_data->op_flags = MF_EPOCH_OPEN; + + /* truncate on a released file must failed with -ENODATA, + * so size must not be set on MDS for released file + * but other attributes must be set + */ + if (S_ISREG(inode->i_mode)) { + struct lov_stripe_md *lsm; + __u32 gen; + + ll_layout_refresh(inode, &gen); + lsm = ccc_inode_lsm_get(inode); + if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED) + file_is_released = true; + ccc_inode_lsm_put(inode, lsm); + } + + /* if not in HSM import mode, clear size attr for released file + * we clear the attribute send to MDT in op_data, not the original + * received from caller in attr which is used later to + * decide return code */ + if (file_is_released && (attr->ia_valid & ATTR_SIZE) && !hsm_import) + op_data->op_attr.ia_valid &= ~ATTR_SIZE; + + rc = ll_md_setattr(dentry, op_data, &mod); + if (rc) + goto out; + + /* truncate failed (only when non HSM import), others succeed */ + if (file_is_released) { + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + rc = -ENODATA; + else + rc = 0; + goto out; + } + + /* RPC to MDT is sent, cancel data modification flag */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + ll_ioepoch_open(lli, op_data->op_ioepoch); + if (!S_ISREG(inode->i_mode)) { + rc = 0; + goto out; + } + + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET)) { + /* For truncate and utimes sending attributes to OSTs, setting + * mtime/atime to the past will be performed under PW [0:EOF] + * extent lock (new_size:EOF for truncate). It may seem + * excessive to send mtime/atime updates to OSTs when not + * setting times to past, but it is necessary due to possible + * time de-synchronization between MDT inode and OST objects */ + if (attr->ia_valid & ATTR_SIZE) + down_write(&lli->lli_trunc_sem); + rc = ll_setattr_ost(inode, attr); + if (attr->ia_valid & ATTR_SIZE) + up_write(&lli->lli_trunc_sem); + } +out: + if (op_data) { + if (op_data->op_ioepoch) { + rc1 = ll_setattr_done_writing(inode, op_data, mod); + if (!rc) + rc = rc1; + } + ll_finish_md_op_data(op_data); + } + if (!S_ISDIR(inode->i_mode)) { + mutex_lock(&inode->i_mutex); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + } + + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? 
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + + return rc; +} + +int ll_setattr(struct dentry *de, struct iattr *attr) +{ + int mode = d_inode(de)->i_mode; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr, false); +} + +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs obd_osfs; + int rc; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) { + CERROR("md_statfs fails: rc = %d\n", rc); + return rc; + } + + osfs->os_type = sb->s_magic; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, + osfs->os_files); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + return rc; + } + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. + */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + + return rc; +} +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). 
*/ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid = ll_s2sbi(sb)->ll_fsid; + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +void ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct lov_stripe_md *lsm = md->lsm; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (!lli->lli_has_smd && + !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) + cl_file_inode_init(inode, md); + + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > MAX_LFS_FILESIZE) + lli->lli_maxbytes = MAX_LFS_FILESIZE; + } + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + if (body->valid & OBD_MD_FLRMTPERM) + ll_update_remote_perm(inode, md->remote_perm); + } +#ifdef CONFIG_FS_POSIX_ACL + else if (body->valid & OBD_MD_FLACL) { + spin_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + spin_unlock(&lli->lli_lock); + } +#endif + inode->i_ino = cl_fid_build_ino(&body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API); + inode->i_generation = cl_fid_build_gen(&body->fid1); + + if (body->valid & OBD_MD_FLATIME) { + if (body->atime > LTIME_S(inode->i_atime)) + LTIME_S(inode->i_atime) = body->atime; + lli->lli_lvb.lvb_atime = body->atime; + } + if (body->valid & OBD_MD_FLMTIME) { + if (body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n", + inode->i_ino, LTIME_S(inode->i_mtime), + body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + lli->lli_lvb.lvb_mtime = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME) { + if (body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; + lli->lli_lvb.lvb_ctime = body->ctime; + } + if (body->valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT); + LASSERT(inode->i_mode != 0); + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, + LL_MAX_BLKSIZE_BITS); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + if (body->valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->uid); + if (body->valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->gid); + if (body->valid & OBD_MD_FLFLAGS) + inode->i_flags = ll_ext_to_inode_flags(body->flags); + if (body->valid & OBD_MD_FLNLINK) + set_nlink(inode, body->nlink); + if (body->valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->rdev); + + if (body->valid & OBD_MD_FLID) { + /* FID shouldn't be changed! 
*/ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1), + "Trying to change FID "DFID + " to the "DFID", inode %lu/%u(%p)\n", + PFID(&lli->lli_fid), PFID(&body->fid1), + inode->i_ino, inode->i_generation, inode); + } else + lli->lli_fid = body->fid1; + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + if (body->valid & OBD_MD_FLSIZE) { + if (exp_connect_som(ll_i2mdexp(inode)) && + S_ISREG(inode->i_mode)) { + struct lustre_handle lockh; + ldlm_mode_t mode; + + /* As it is possible a blocking ast has been processed + * by this time, we need to check there is an UPDATE + * lock on the client and set LLIF_MDS_SIZE_LOCK holding + * it. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE, + &lockh, LDLM_FL_CBPENDING, + LCK_CR | LCK_CW | + LCK_PR | LCK_PW); + if (mode) { + if (lli->lli_flags & (LLIF_DONE_WRITING | + LLIF_EPOCH_PENDING | + LLIF_SOM_DIRTY)) { + CERROR("ino %lu flags %u still has size authority! do not trust the size got from MDS\n", + inode->i_ino, lli->lli_flags); + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + } + ldlm_lock_decref(&lockh, mode); + } + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + + CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n", + inode->i_ino, (unsigned long long)body->size); + } + + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + LASSERT(md->mds_capa); + ll_add_capa(inode, md->mds_capa); + } + if (body->valid & OBD_MD_FLOSSCAPA) { + LASSERT(md->oss_capa); + ll_add_capa(inode, md->oss_capa); + } + + if (body->valid & OBD_MD_TSTATE) { + if (body->t_state & MS_RESTORE) + lli->lli_flags |= LLIF_FILE_RESTORING; + } +} + +void ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + LASSERT(!lli->lli_has_smd); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. */ + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + ll_update_inode(inode, md); + + /* OIDEBUG(inode); */ + + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } +} + +void ll_delete_inode(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + /* discard all dirty pages before truncating them, required by + * osc_extent implementation at LU-1030. 
*/ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, + CL_FSYNC_DISCARD, 1); + + truncate_inode_pages_final(&inode->i_data); + + /* Workaround for LU-118 */ + if (inode->i_data.nrpages) { + spin_lock_irq(&inode->i_data.tree_lock); + spin_unlock_irq(&inode->i_data.tree_lock); + LASSERTF(inode->i_data.nrpages == 0, + "inode=%lu/%u(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", + inode->i_ino, inode->i_generation, inode, + inode->i_data.nrpages); + } + /* Workaround end */ + + ll_clear_inode(inode); + clear_inode(inode); +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + + switch (cmd) { + case FSFILT_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu\n", rc, inode->i_ino); + return -abs(rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->flags; + + ptlrpc_req_finished(req); + + return put_user(flags, (int *)arg); + } + case FSFILT_IOC_SETFLAGS: { + struct lov_stripe_md *lsm; + struct obd_info oinfo = { { { 0 } } }; + struct md_op_data *op_data; + + if (get_user(flags, (int *)arg)) + return -EFAULT; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + rc = md_setattr(sbi->ll_md_exp, op_data, + NULL, 0, NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + return rc; + + inode->i_flags = ll_ext_to_inode_flags(flags); + + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + ccc_inode_lsm_put(inode, lsm); + return 0; + } + + OBDO_ALLOC(oinfo.oi_oa); + if (!oinfo.oi_oa) { + ccc_inode_lsm_put(inode, lsm); + return -ENOMEM; + } + oinfo.oi_md = lsm; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_flags = flags; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | + OBD_MD_FLGROUP; + oinfo.oi_capa = ll_mdscapa_get(inode); + obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid); + rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL); + capa_put(oinfo.oi_capa); + OBDO_FREE(oinfo.oi_oa); + ccc_inode_lsm_put(inode, lsm); + + if (rc && rc != -EPERM && rc != -EACCES) + CERROR("osc_setattr_async fails: rc = %d\n", rc); + + return rc; + } + default: + return -ENOSYS; + } + + return 0; +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = 
class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + return; + } + obd->obd_force = 1; + + ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof(*ioc_data), ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof(*ioc_data), ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule() and sleep one second if needed, and hope. + */ + schedule(); +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= MS_RDONLY; + else + sb->s_flags &= ~MS_RDONLY; + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md; + int rc; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) + return rc; + + if (*inode) { + ll_update_inode(*inode, &md); + } else { + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + LASSERT(fid_is_sane(&md.body->fid1)); + + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API), + &md); + if (*inode == NULL || IS_ERR(*inode)) { +#ifdef CONFIG_FS_POSIX_ACL + if (md.posix_acl) { + posix_acl_release(md.posix_acl); + md.posix_acl = NULL; + } +#endif + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + goto out; + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. 
proc1: to apply a stale layout */ + if (it != NULL && it->d.lustre.it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->d.lustre.it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + +out: + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + md_free_lustre_md(sbi->ll_md_exp, &md); + return rc; +} + +int ll_obd_statfs(struct inode *inode, void *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + char *buf = NULL; + struct obd_ioctl_data *data = NULL; + __u32 type; + __u32 flags; + int len = 0, rc; + + if (!inode) { + rc = -EINVAL; + goto out_statfs; + } + + sbi = ll_i2sbi(inode); + if (!sbi) { + rc = -EINVAL; + goto out_statfs; + } + + rc = obd_ioctl_getdata(&buf, &len, arg); + if (rc) + goto out_statfs; + + data = (void *)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) { + rc = -EINVAL; + goto out_statfs; + } + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) { + rc = -EINVAL; + goto out_statfs; + } + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else { + rc = -ENODEV; + goto out_statfs; + } + + flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0; + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags); + if (rc) + goto out_statfs; +out_statfs: + if (buf) + obd_ioctl_freedata(buf, len); + return rc; +} + +int ll_process_config(struct lustre_cfg *lcfg) +{ + char *ptr; + void *sb; + struct lprocfs_static_vars lvars; + unsigned long x; + int rc = 0; + + lprocfs_llite_init_vars(&lvars); + + /* The instance name contains the sb: lustre-client-aacfe000 */ + ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!ptr || !*(++ptr)) + return -EINVAL; + rc = kstrtoul(ptr, 16, &x); + if (rc != 0) + return -EINVAL; + sb = (void *)x; + /* This better be a real Lustre superblock! */ + LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC); + + /* Note we have not called client_common_fill_super yet, so + proc fns must be able to handle that! */ + rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, + lcfg, sb); + if (rc > 0) + rc = 0; + return rc; +} + +/* this function prepares md_op_data hint for passing ot down to MD stack. 
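/*
 * Illustrative sketch (not from the Lustre tree): ll_process_config() above
 * recovers the superblock cookie from a config instance name such as
 * "lustre-client-aacfe000" by taking the text after the last '-' and parsing
 * it as hexadecimal.  The userspace equivalent below uses strtoul() where the
 * kernel code uses kstrtoul(); the sample name is made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_instance(const char *name, unsigned long *cookie)
{
	const char *ptr = strrchr(name, '-');

	if (!ptr || !*(++ptr))
		return -1;			/* no '-' or nothing after it */
	*cookie = strtoul(ptr, NULL, 16);
	return 0;
}

int main(void)
{
	unsigned long x;

	if (!parse_instance("lustre-client-aacfe000", &x))
		printf("sb cookie = 0x%lx\n", x);	/* 0xaacfe000 */
	return 0;
}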
*/ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data) +{ + LASSERT(i1 != NULL); + + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + if (op_data == NULL) + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_capa1 = ll_mdscapa_get(i1); + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + op_data->op_capa2 = ll_mdscapa_get(i2); + } else { + fid_zero(&op_data->op_fid2); + op_data->op_capa2 = NULL; + } + + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = get_seconds(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_bias = 0; + op_data->op_cli_flags = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, NULL)) + op_data->op_bias |= MDS_CREATE_VOLATILE; + op_data->op_opc = opc; + op_data->op_mds = 0; + op_data->op_data = data; + + /* If the file is being opened after mknod() (normally due to NFS) + * try to use the default stripe data from parent directory for + * allocating OST objects. Try to pass the parent FID to MDS. */ + if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) && + !ll_i2info(i2)->lli_has_smd) { + struct ll_inode_info *lli = ll_i2info(i2); + + spin_lock(&lli->lli_lock); + if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid))) + op_data->op_fid1 = lli->lli_pfid; + spin_unlock(&lli->lli_lock); + /** We ignore parent's capability temporary. */ + } + + /* When called by ll_setattr_raw, file is i1. */ + if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + capa_put(op_data->op_capa1); + capa_put(op_data->op_capa2); + OBD_FREE_PTR(op_data); +} + +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct ll_sb_info *sbi; + + LASSERT((seq != NULL) && (dentry != NULL)); + sbi = ll_s2sbi(dentry->d_sb); + + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); + + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + + if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); + + if (sbi->ll_flags & LL_SBI_USER_FID2PATH) + seq_puts(seq, ",user_fid2path"); + + return 0; +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + + if (cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + return -EINVAL; + + if (!obd) + return -ENOENT; + + if (copy_to_user((void *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + return -EFAULT; + + return 0; +} + +/** + * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the + * fsname will be returned in this buffer; otherwise, a static buffer will be + * used to store the fsname and returned to caller. 
+ */ +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) +{ + static char fsname_static[MTI_NAME_MAXLEN]; + struct lustre_sb_info *lsi = s2lsi(sb); + char *ptr; + int len; + + if (buf == NULL) { + /* this means the caller wants to use static buffer + * and it doesn't care about race. Usually this is + * in error reporting path */ + buf = fsname_static; + buflen = sizeof(fsname_static); + } + + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (unlikely(len >= buflen)) + len = buflen - 1; + strncpy(buf, lsi->lsi_lmd->lmd_profile, len); + buf[len] = '\0'; + + return buf; +} + +void ll_dirty_page_discard_warn(struct page *page, int ioret) +{ + char *buf, *path = NULL; + struct dentry *dentry = NULL; + struct ccc_object *obj = cl_inode2ccc(page->mapping->host); + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); + if (buf != NULL) { + dentry = d_find_alias(page->mapping->host); + if (dentry != NULL) + path = dentry_path_raw(dentry, buf, PAGE_SIZE); + } + + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n", + ll_get_fsname(page->mapping->host->i_sb, NULL, 0), + s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, + PFID(&obj->cob_header.coh_lu.loh_fid), + (path && !IS_ERR(path)) ? path : "", ioret); + + if (dentry != NULL) + dput(dentry); + + if (buf != NULL) + free_page((unsigned long)buf); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c b/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c new file mode 100644 index 000000000..a90214bb8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
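/*
 * Illustrative sketch (not from the Lustre tree): ll_get_fsname() above
 * derives the filesystem name from the mount profile by dropping a trailing
 * "-client" (7 characters), e.g. "lustre-client" -> "lustre".  Standalone
 * rendering of that string handling; the buffer size is arbitrary.
 */
#include <stdio.h>
#include <string.h>

static char *fsname_from_profile(const char *profile, char *buf, int buflen)
{
	int len = (int)strlen(profile);
	const char *ptr = strrchr(profile, '-');

	if (ptr && strcmp(ptr, "-client") == 0)
		len -= 7;
	if (len >= buflen)
		len = buflen - 1;
	memcpy(buf, profile, len);
	buf[len] = '\0';
	return buf;
}

int main(void)
{
	char buf[64];

	printf("%s\n", fsname_from_profile("lustre-client", buf, sizeof(buf)));
	return 0;
}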
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, + size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) + + (vma->vm_pgoff << PAGE_CACHE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~CFS_PAGE_MASK; +} + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + + /* mmap_sem must have been held by caller. */ + LASSERT(!down_write_trylock(&mm->mmap_sem)); + + for (vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + return ret; +} + +/** + * API independent part for page fault initialization. + * \param vma - virtual memory area addressed to page fault + * \param env - corespondent lu_env to processing + * \param nest - nested level + * \param index - page index corespondent to fault. + * \parm ra_flags - vma readahead flags. + * + * \return allocated and initialized env for fault operation. + * \retval EINVAL if env can't allocated + * \return other error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, + struct cl_env_nest *nest, pgoff_t index, + unsigned long *ra_flags) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + struct lu_env *env; + int rc; + + *env_ret = NULL; + if (ll_file_nolock(file)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * page fault can be called when lustre IO is + * already active for the current thread, e.g., when doing read/write + * against user level buffer mapped from Lustre buffer. To avoid + * stomping on existing context, optionally force an allocation of a new + * one. + */ + env = cl_env_nested_get(nest); + if (IS_ERR(env)) + return ERR_PTR(-EINVAL); + + *env_ret = env; + + io = ccc_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct ccc_io *cio = ccc_env_io(env); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(cio->cui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
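/*
 * Illustrative sketch (not from the Lustre tree): policy_from_vma() above
 * turns a faulting address range into a page-aligned file extent: the offset
 * within the mapping is rounded down to a page boundary and shifted by the
 * mapping's file offset (vm_pgoff, in pages); the end is rounded up to the
 * last byte of its page.  Worked example assuming 4 KiB pages and
 * CFS_PAGE_MASK == ~(page_size - 1).
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1ULL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

int main(void)
{
	uint64_t vm_start = 0x7f0000001000ULL;	/* start of the mapping           */
	uint64_t vm_pgoff = 16;			/* mapping starts at file page 16 */
	uint64_t addr	  = 0x7f0000003234ULL;	/* faulting address               */
	uint64_t count	  = EX_PAGE_SIZE;

	uint64_t start = ((addr - vm_start) & EX_PAGE_MASK) +
			 (vm_pgoff << EX_PAGE_SHIFT);
	uint64_t end   = (start + count - 1) | ~EX_PAGE_MASK;

	/* addr is 0x2234 into the mapping -> page 2 of the mapping, which is
	 * file page 18, so the extent is [0x12000, 0x12fff]. */
	printf("extent [%#llx, %#llx]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}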
*/ + io->ci_lockreq = CILR_MANDATORY; + cio->cui_fd = fd; + } else { + LASSERT(rc < 0); + cl_io_fini(env, io); + cl_env_nested_put(nest, env); + io = ERR_PTR(rc); + } + + return io; +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_env_nest nest; + int result; + sigset_t set; + struct inode *inode; + struct ll_inode_info *lli; + + LASSERT(vmpage != NULL); + + io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL); + if (IS_ERR(io)) { + result = PTR_ERR(io); + goto out; + } + + result = io->ci_result; + if (result < 0) + goto out_io; + + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + /* we grab lli_trunc_sem to exclude truncate case. + * Otherwise, we could add dirty pages into osc cache + * while truncate is on-going. */ + inode = ccc_object_inode(io->ci_obj); + lli = ll_i2info(inode); + down_read(&lli->lli_trunc_sem); + + result = cl_io_loop(env, io); + + up_read(&lli->lli_trunc_sem); + + cfs_restore_sigs(set); + + if (result == 0) { + struct inode *inode = file_inode(vma->vm_file); + struct ll_inode_info *lli = ll_i2info(inode); + + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + } + +out_io: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); +out: + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + return result; +} + + + +static inline int to_fault_error(int result) +{ + switch (result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
+ * + * \param vma - is virtual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + unsigned long ra_flags; + struct cl_env_nest nest; + int result; + int fault_ret = 0; + + io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) + return to_fault_error(PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.fault.ft_vmf = vmf; + vio->u.fault.fault.ft_flags = 0; + vio->u.fault.fault.ft_flags_valid = false; + + result = cl_io_loop(env, io); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.fault.ft_flags_valid) + fault_ret = vio->u.fault.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + page_cache_release(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + vma->vm_flags |= ra_flags; + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", + current->comm, fault_ret, result); + return fault_ret; +} + +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + int result; + sigset_t set; + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause segfault by + * other signals. */ + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + +restart: + result = ll_fault0(vma, vmf); + LASSERT(!(result & VM_FAULT_LOCKED)); + if (result == 0) { + struct page *vmpage = vmf->page; + + /* check if this page has been truncated */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL)) { /* unlucky */ + unlock_page(vmpage); + page_cache_release(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n", + current->comm); + printed = true; + } + + goto restart; + } + + result = VM_FAULT_LOCKED; + } + cfs_restore_sigs(set); + return result; +} + +static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n", + current->comm, vmf->pgoff, + file_inode(vma->vm_file)->i_ino); + printed = true; + } + } while (retry); + + switch (result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in ccc_object::cob_mmap_cnt. 
+ */ +static void ll_vm_open(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ccc_object *vob = cl_inode2ccc(inode); + + LASSERT(vma->vm_file); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + atomic_inc(&vob->cob_mmap_cnt); +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ccc_object *vob = cl_inode2ccc(inode); + + LASSERT(vma->vm_file); + atomic_dec(&vob->cob_mmap_cnt); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); +} + +/* XXX put nice comment here. talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) +{ + int rc = -ENOENT; + + LASSERTF(last > first, "last %llu first %llu\n", last, first); + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1, + last - first + 1, 0); + } + + return rc; +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + int rc; + + if (ll_file_nolock(file)) + return -EOPNOTSUPP; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(inode); + } + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c b/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c new file mode 100644 index 000000000..db43b8138 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -0,0 +1,335 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include + +__u32 get_uuid2int(const char *name, int len) +{ + __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + while (len--) { + __u32 key = key1 + (key0 ^ (*name++ * 7152373)); + + if (key & 0x80000000) + key -= 0x7fffffff; + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid) +{ + __u64 key = 0, key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + + while (len--) { + key = key1 + (key0 ^ (*name++ * 7152373)); + if (key & 0x8000000000000000ULL) + key -= 0x7fffffffffffffffULL; + key1 = key0; + key0 = key; + } + + fsid->val[0] = key; + fsid->val[1] = key >> 32; +} + +static int ll_nfs_test_inode(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, + (struct lu_fid *)opaque); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, + ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid); + if (inode) + return inode; + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + return ERR_PTR(rc); + + /* Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves */ + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + CERROR("can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + return ERR_PTR(rc); + } + rc = ll_prep_inode(&inode, req, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + return ERR_PTR(rc); + + return inode; +} + +struct lustre_nfs_fid { + struct lu_fid lnf_child; + struct lu_fid lnf_parent; +}; + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + if (!fid_is_sane(fid)) + return ERR_PTR(-ESTALE); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + + /** + * It is an anonymous dentry without OST objects created yet. + * We have to find the parent to tell MDS how to init lov objects. 
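/*
 * Illustrative sketch (not from the Lustre tree): get_uuid2int() and
 * get_uuid2fsid() above reduce a client UUID string to a 32-bit key or a
 * 64-bit fsid with a small rolling hash.  Standalone copy of the 32-bit
 * variant, fed a made-up UUID, to show the intended call pattern.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t uuid2int(const char *name, int len)
{
	uint32_t key0 = 0x12a3fe2d, key1 = 0x37abe8f9;

	while (len--) {
		uint32_t key = key1 + (key0 ^ (*name++ * 7152373));

		if (key & 0x80000000)
			key -= 0x7fffffff;
		key1 = key0;
		key0 = key;
	}
	return key0 << 1;
}

int main(void)
{
	const char *uuid = "lustre-client-ffff8800deadbeef";	/* example only */

	printf("uuid key = %#x\n", uuid2int(uuid, (int)strlen(uuid)));
	return 0;
}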
+ */ + if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd && + parent != NULL) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_pfid = *parent; + spin_unlock(&lli->lli_lock); + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) { + iput(inode); + return result; + } + + return result; +} + +#define LUSTRE_NFS_FID 0x97 + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. + */ +static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, + struct inode *parent) +{ + struct lustre_nfs_fid *nfs_fid = (void *)fh; + + CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n", + inode->i_ino, PFID(ll_inode2fid(inode)), *plen, + (int)sizeof(struct lustre_nfs_fid)); + + if (*plen < sizeof(struct lustre_nfs_fid) / 4) + return 255; + + nfs_fid->lnf_child = *ll_inode2fid(inode); + nfs_fid->lnf_parent = *ll_inode2fid(parent); + *plen = sizeof(struct lustre_nfs_fid) / 4; + + return LUSTRE_NFS_FID; +} + +static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, + int namelen, loff_t hash, u64 ino, + unsigned type) +{ + /* It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent'. */ + struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +static int ll_get_name(struct dentry *dentry, char *name, + struct dentry *child) +{ + struct inode *dir = d_inode(dentry); + int rc; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(d_inode(child))->lli_fid, + .ctx.actor = ll_nfs_get_name_filldir, + }; + + if (!dir || !S_ISDIR(dir->i_mode)) { + rc = -ENOTDIR; + goto out; + } + + if (!dir->i_fop) { + rc = -EINVAL; + goto out; + } + + mutex_lock(&dir->i_mutex); + rc = ll_dir_read(dir, &lgd.ctx); + mutex_unlock(&dir->i_mutex); + if (!rc && !lgd.lgd_found) + rc = -ENOENT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + return ERR_PTR(-EPROTO); + + return ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + return ERR_PTR(-EPROTO); + + return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct ptlrpc_request *req = NULL; + struct inode *dir = d_inode(dchild); + struct ll_sb_info *sbi; + struct dentry *result = NULL; + struct mdt_body *body; + static char dotdot[] = ".."; + struct md_op_data *op_data; + int rc; + int lmmsize; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n", + dir->i_ino, PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 
0) + return ERR_PTR(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino); + return ERR_PTR(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body->valid & OBD_MD_FLID); + + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->fid1)); + + result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL); + + ptlrpc_req_finished(req); + return result; +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c new file mode 100644 index 000000000..f4da156f3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c @@ -0,0 +1,300 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_rmtacl.c + * + * Lustre Remote User Access Control List. 
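/*
 * Illustrative sketch (not from the Lustre tree): ll_encode_fh() in the NFS
 * export code above packs the child FID plus, when available, the parent FID
 * into the opaque file handle and reports its length in 32-bit words, as the
 * exportfs API expects.  Layout check below, assuming struct lu_fid has the
 * usual { __u64 f_seq; __u32 f_oid; __u32 f_ver; } shape.
 */
#include <stdio.h>
#include <stdint.h>

struct ex_lu_fid {			/* stand-in for struct lu_fid         */
	uint64_t f_seq;
	uint32_t f_oid;
	uint32_t f_ver;
};

struct ex_lustre_nfs_fid {		/* stand-in for struct lustre_nfs_fid */
	struct ex_lu_fid lnf_child;
	struct ex_lu_fid lnf_parent;
};

int main(void)
{
	/* 2 x 16 bytes = 32 bytes, so *plen is reported as 8 words. */
	printf("fh bytes = %zu, *plen = %zu\n",
	       sizeof(struct ex_lustre_nfs_fid),
	       sizeof(struct ex_lustre_nfs_fid) / 4);
	return 0;
}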
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifdef CONFIG_FS_POSIX_ACL + +#include "../include/lustre_lite.h" +#include "../include/lustre_eacl.h" +#include "llite_internal.h" + +static inline __u32 rce_hashfunc(uid_t id) +{ + return id & (RCE_HASHES - 1); +} + +static inline __u32 ee_hashfunc(uid_t id) +{ + return id & (EE_HASHES - 1); +} + +u64 rce_ops2valid(int ops) +{ + switch (ops) { + case RMT_LSETFACL: + return OBD_MD_FLRMTLSETFACL; + case RMT_LGETFACL: + return OBD_MD_FLRMTLGETFACL; + case RMT_RSETFACL: + return OBD_MD_FLRMTRSETFACL; + case RMT_RGETFACL: + return OBD_MD_FLRMTRGETFACL; + default: + return 0; + } +} + +static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce; + + rce = kzalloc(sizeof(*rce), GFP_NOFS); + if (!rce) + return NULL; + + INIT_LIST_HEAD(&rce->rce_list); + rce->rce_key = key; + rce->rce_ops = ops; + + return rce; +} + +static void rce_free(struct rmtacl_ctl_entry *rce) +{ + if (!list_empty(&rce->rce_list)) + list_del(&rce->rce_list); + + OBD_FREE_PTR(rce); +} + +static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct, + pid_t key) +{ + struct rmtacl_ctl_entry *rce; + struct list_head *head = &rct->rct_entries[rce_hashfunc(key)]; + + list_for_each_entry(rce, head, rce_list) + if (rce->rce_key == key) + return rce; + + return NULL; +} + +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + spin_unlock(&rct->rct_lock); + return rce; +} + +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce, *e; + + rce = rce_alloc(key, ops); + if (rce == NULL) + return -ENOMEM; + + spin_lock(&rct->rct_lock); + e = __rct_search(rct, key); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale rmtacl_entry found: [key: %d] [ops: %d]\n", + (int)key, ops); + rce_free(e); + } + list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]); + spin_unlock(&rct->rct_lock); + + return 0; +} + +int rct_del(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + if (rce) + rce_free(rce); + spin_unlock(&rct->rct_lock); + + return rce ? 
0 : -ENOENT; +} + +void rct_init(struct rmtacl_ctl_table *rct) +{ + int i; + + spin_lock_init(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + INIT_LIST_HEAD(&rct->rct_entries[i]); +} + +void rct_fini(struct rmtacl_ctl_table *rct) +{ + struct rmtacl_ctl_entry *rce; + int i; + + spin_lock(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + while (!list_empty(&rct->rct_entries[i])) { + rce = list_entry(rct->rct_entries[i].next, + struct rmtacl_ctl_entry, rce_list); + rce_free(rce); + } + spin_unlock(&rct->rct_lock); +} + + +static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_NOFS); + if (!ee) + return NULL; + + INIT_LIST_HEAD(&ee->ee_list); + ee->ee_key = key; + ee->ee_fid = *fid; + ee->ee_type = type; + ee->ee_acl = header; + + return ee; +} + +void ee_free(struct eacl_entry *ee) +{ + if (!list_empty(&ee->ee_list)) + list_del(&ee->ee_list); + + if (ee->ee_acl) + lustre_ext_acl_xattr_free(ee->ee_acl); + + OBD_FREE_PTR(ee); +} + +static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + LASSERT(fid != NULL); + list_for_each_entry(ee, head, ee_list) + if (ee->ee_key == key) { + if (lu_fid_eq(&ee->ee_fid, fid) && + ee->ee_type == type) { + list_del_init(&ee->ee_list); + return ee; + } + } + + return NULL; +} + +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + + spin_lock(&et->et_lock); + ee = __et_search_del(et, key, fid, type); + spin_unlock(&et->et_lock); + return ee; +} + +void et_search_free(struct eacl_table *et, pid_t key) +{ + struct eacl_entry *ee, *next; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + spin_lock(&et->et_lock); + list_for_each_entry_safe(ee, next, head, ee_list) + if (ee->ee_key == key) + ee_free(ee); + + spin_unlock(&et->et_lock); +} + +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee, *e; + + ee = ee_alloc(key, fid, type, header); + if (ee == NULL) + return -ENOMEM; + + spin_lock(&et->et_lock); + e = __et_search_del(et, key, fid, type); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale eacl_entry found: [key: %d] [fid: " DFID "] [type: %d]\n", + (int)key, PFID(fid), type); + ee_free(e); + } + list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]); + spin_unlock(&et->et_lock); + + return 0; +} + +void et_init(struct eacl_table *et) +{ + int i; + + spin_lock_init(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + INIT_LIST_HEAD(&et->et_entries[i]); +} + +void et_fini(struct eacl_table *et) +{ + struct eacl_entry *ee; + int i; + + spin_lock(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + while (!list_empty(&et->et_entries[i])) { + ee = list_entry(et->et_entries[i].next, + struct eacl_entry, ee_list); + ee_free(ee); + } + spin_unlock(&et->et_lock); +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/lloop.c b/kernel/drivers/staging/lustre/lustre/llite/lloop.c new file mode 100644 index 000000000..413a8408e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/lloop.c @@ -0,0 +1,877 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * linux/drivers/block/loop.c + * + * Written by Theodore Ts'o, 3/29/93 + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + * + * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 + * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 + * + * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 + * + * Added devfs support - Richard Gooch 16-Jan-1998 + * + * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 + * + * Loadable modules and other fixes by AK, 1998 + * + * Maximum number of loop devices now dynamic via max_loop module parameter. + * Russell Kroll 19990701 + * + * Maximum number of loop devices when compiled-in now selectable by passing + * max_loop=<1-255> to the kernel on boot. + * Erik I. Bols?, , Oct 31, 1999 + * + * Completely rewrite request handling to be make_request_fn style and + * non blocking, pushing work to a helper thread. Lots of fixes from + * Al Viro too. + * Jens Axboe , Nov 2000 + * + * Support up to 256 loop devices + * Heinz Mauelshagen , Feb 2002 + * + * Support for falling back on the write file operation when the address space + * operations prepare_write and/or commit_write are not available on the + * backing filesystem. + * Anton Altaparmakov, 16 Feb 2005 + * + * Still To Fix: + * - Advisory locking is ignored here. 
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for invalidate_bdev() */ +#include +#include +#include +#include +#include + +#include "../include/lustre_lib.h" +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +#define LLOOP_MAX_SEGMENTS LNET_MAX_IOV + +/* Possible states of device */ +enum { + LLOOP_UNBOUND, + LLOOP_BOUND, + LLOOP_RUNDOWN, +}; + +struct lloop_device { + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + struct file *lo_backing_file; + struct block_device *lo_device; + unsigned lo_blocksize; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + struct bio *lo_bio; + struct bio *lo_biotail; + int lo_state; + struct semaphore lo_sem; + struct mutex lo_ctl_mutex; + atomic_t lo_pending; + wait_queue_head_t lo_bh_wait; + + struct request_queue *lo_queue; + + const struct lu_env *lo_env; + struct cl_io lo_io; + struct ll_dio_pages lo_pvec; + + /* data to handle bio for lustre. */ + struct lo_request_data { + struct page *lrd_pages[LLOOP_MAX_SEGMENTS]; + loff_t lrd_offsets[LLOOP_MAX_SEGMENTS]; + } lo_requests[1]; +}; + +/* + * Loop flags + */ +enum { + LO_FLAGS_READ_ONLY = 1, +}; + +static int lloop_major; +#define MAX_LOOP_DEFAULT 16 +static int max_loop = MAX_LOOP_DEFAULT; +static struct lloop_device *loop_dev; +static struct gendisk **disks; +static struct mutex lloop_mutex; +static void *ll_iocontrol_magic = NULL; + +static loff_t get_loop_size(struct lloop_device *lo, struct file *file) +{ + loff_t size, offset, loopsize; + + /* Compute loopsize in bytes */ + size = i_size_read(file->f_mapping->host); + offset = lo->lo_offset; + loopsize = size - offset; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; + + /* + * Unfortunately, if we want to do I/O on the device, + * the number of 512-byte sectors has to fit into a sector_t. + */ + return loopsize >> 9; +} + +static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) +{ + const struct lu_env *env = lo->lo_env; + struct cl_io *io = &lo->lo_io; + struct inode *inode = file_inode(lo->lo_backing_file); + struct cl_object *obj = ll_i2info(inode)->lli_clob; + pgoff_t offset; + int ret; + int rw; + u32 page_count = 0; + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *bio; + ssize_t bytes; + + struct ll_dio_pages *pvec = &lo->lo_pvec; + struct page **pages = pvec->ldp_pages; + loff_t *offsets = pvec->ldp_offsets; + + truncate_inode_pages(inode->i_mapping, 0); + + /* initialize the IO */ + memset(io, 0, sizeof(*io)); + io->ci_obj = obj; + ret = cl_io_init(env, io, CIT_MISC, obj); + if (ret) + return io->ci_result; + io->ci_lockreq = CILR_NEVER; + + LASSERT(head != NULL); + rw = head->bi_rw; + for (bio = head; bio != NULL; bio = bio->bi_next) { + LASSERT(rw == bio->bi_rw); + + offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_offset != 0); + BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE); + + pages[page_count] = bvec.bv_page; + offsets[page_count] = offset; + page_count++; + offset += bvec.bv_len; + } + LASSERT(page_count <= LLOOP_MAX_SEGMENTS); + } + + ll_stats_ops_tally(ll_i2sbi(inode), + (rw == WRITE) ? 
LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ, + page_count); + + pvec->ldp_size = page_count << PAGE_CACHE_SHIFT; + pvec->ldp_nr = page_count; + + /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to + * write those pages into OST. Even worse case is that more pages + * would be asked to write out to swap space, and then finally get here + * again. + * Unfortunately this is NOT easy to fix. + * Thoughts on solution: + * 0. Define a reserved pool for cl_pages, which could be a list of + * pre-allocated cl_pages; + * 1. Define a new operation in cl_object_operations{}, says clo_depth, + * which measures how many layers for this lustre object. Generally + * speaking, the depth would be 2, one for llite, and one for lovsub. + * However, for SNS, there will be more since we need additional page + * to store parity; + * 2. Reserve the # of (page_count * depth) cl_pages from the reserved + * pool. Afterwards, the clio would allocate the pages from reserved + * pool, this guarantees we needn't allocate the cl_pages from + * generic cl_page slab cache. + * Of course, if there is NOT enough pages in the pool, we might + * be asked to write less pages once, this purely depends on + * implementation. Anyway, we should be careful to avoid deadlocking. + */ + mutex_lock(&inode->i_mutex); + bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); + mutex_unlock(&inode->i_mutex); + cl_io_fini(env, io); + return (bytes == pvec->ldp_size) ? 0 : (int)bytes; +} + +/* + * Add bio to back of pending list + */ +static void loop_add_bio(struct lloop_device *lo, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&lo->lo_lock, flags); + if (lo->lo_biotail) { + lo->lo_biotail->bi_next = bio; + lo->lo_biotail = bio; + } else + lo->lo_bio = lo->lo_biotail = bio; + spin_unlock_irqrestore(&lo->lo_lock, flags); + + atomic_inc(&lo->lo_pending); + if (waitqueue_active(&lo->lo_bh_wait)) + wake_up(&lo->lo_bh_wait); +} + +/* + * Grab first pending buffer + */ +static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) +{ + struct bio *first; + struct bio **bio; + unsigned int count = 0; + unsigned int page_count = 0; + int rw; + + spin_lock_irq(&lo->lo_lock); + first = lo->lo_bio; + if (unlikely(first == NULL)) { + spin_unlock_irq(&lo->lo_lock); + return 0; + } + + /* TODO: need to split the bio, too bad. */ + LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS); + + rw = first->bi_rw; + bio = &lo->lo_bio; + while (*bio && (*bio)->bi_rw == rw) { + CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", + (unsigned long long)(*bio)->bi_iter.bi_sector, + (*bio)->bi_iter.bi_size, + page_count, (*bio)->bi_vcnt); + if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) + break; + + + page_count += (*bio)->bi_vcnt; + count++; + bio = &(*bio)->bi_next; + } + if (*bio) { + /* Some of bios can't be mergeable. 
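/*
 * Illustrative sketch (not from the Lustre tree): loop_add_bio() and
 * loop_get_bio() above keep pending bios on an intrusive singly linked list
 * with separate head (lo_bio) and tail (lo_biotail) pointers, so the
 * make_request path appends in O(1) and the worker later detaches a run of
 * same-direction bios.  Minimal userspace rendering of that list handling
 * with a toy bio type; the spinlock and segment-count limit are omitted.
 */
#include <stdio.h>
#include <stddef.h>

struct ex_bio {
	int rw;				/* request direction             */
	struct ex_bio *bi_next;		/* intrusive next pointer        */
};

struct ex_queue {
	struct ex_bio *head;		/* lo_bio in the driver          */
	struct ex_bio *tail;		/* lo_biotail in the driver      */
};

static void ex_add(struct ex_queue *q, struct ex_bio *bio)
{
	bio->bi_next = NULL;
	if (q->tail)
		q->tail->bi_next = bio;
	else
		q->head = bio;
	q->tail = bio;
}

static struct ex_bio *ex_get_run(struct ex_queue *q)
{
	struct ex_bio *first = q->head, **pos = &q->head;

	if (!first)
		return NULL;
	while (*pos && (*pos)->rw == first->rw)	/* keep one direction per run */
		pos = &(*pos)->bi_next;
	q->head = *pos;			/* remainder (possibly NULL) stays queued */
	if (!q->head)
		q->tail = NULL;
	*pos = NULL;			/* terminate the detached run             */
	return first;
}

int main(void)
{
	struct ex_bio a = { 0, NULL }, b = { 0, NULL }, c = { 1, NULL };
	struct ex_queue q = { NULL, NULL };
	struct ex_bio *bio;

	ex_add(&q, &a);
	ex_add(&q, &b);
	ex_add(&q, &c);			/* different direction, left queued */
	for (bio = ex_get_run(&q); bio; bio = bio->bi_next)
		printf("got bio rw=%d\n", bio->rw);
	return 0;
}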
*/ + lo->lo_bio = *bio; + *bio = NULL; + } else { + /* Hit the end of queue */ + lo->lo_biotail = NULL; + lo->lo_bio = NULL; + } + *req = first; + spin_unlock_irq(&lo->lo_lock); + return count; +} + +static void loop_make_request(struct request_queue *q, struct bio *old_bio) +{ + struct lloop_device *lo = q->queuedata; + int rw = bio_rw(old_bio); + int inactive; + + if (!lo) + goto err; + + CDEBUG(D_INFO, "submit bio sector %llu size %u\n", + (unsigned long long)old_bio->bi_iter.bi_sector, + old_bio->bi_iter.bi_size); + + spin_lock_irq(&lo->lo_lock); + inactive = lo->lo_state != LLOOP_BOUND; + spin_unlock_irq(&lo->lo_lock); + if (inactive) + goto err; + + if (rw == WRITE) { + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { + rw = READ; + } else if (rw != READ) { + CERROR("lloop: unknown command (%x)\n", rw); + goto err; + } + loop_add_bio(lo, old_bio); + return; +err: + cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size); +} + + +static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) +{ + int ret; + ret = do_bio_lustrebacked(lo, bio); + while (bio) { + struct bio *tmp = bio->bi_next; + bio->bi_next = NULL; + cfs_bio_endio(bio, bio->bi_iter.bi_size, ret); + bio = tmp; + } +} + +static inline int loop_active(struct lloop_device *lo) +{ + return atomic_read(&lo->lo_pending) || + (lo->lo_state == LLOOP_RUNDOWN); +} + +/* + * worker thread that handles reads/writes to file backed loop devices, + * to avoid blocking in our make_request_fn. + */ +static int loop_thread(void *data) +{ + struct lloop_device *lo = data; + struct bio *bio; + unsigned int count; + unsigned long times = 0; + unsigned long total_count = 0; + + struct lu_env *env; + int refcheck; + int ret = 0; + + set_user_nice(current, MIN_NICE); + + lo->lo_state = LLOOP_BOUND; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + ret = PTR_ERR(env); + goto out; + } + + lo->lo_env = env; + memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec)); + lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages; + lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets; + + /* + * up sem, we are running + */ + up(&lo->lo_sem); + + for (;;) { + wait_event(lo->lo_bh_wait, loop_active(lo)); + if (!atomic_read(&lo->lo_pending)) { + int exiting = 0; + spin_lock_irq(&lo->lo_lock); + exiting = (lo->lo_state == LLOOP_RUNDOWN); + spin_unlock_irq(&lo->lo_lock); + if (exiting) + break; + } + + bio = NULL; + count = loop_get_bio(lo, &bio); + if (!count) { + CWARN("lloop(minor: %d): missing bio\n", lo->lo_number); + continue; + } + + total_count += count; + if (total_count < count) { /* overflow */ + total_count = count; + times = 1; + } else { + times++; + } + if ((times & 127) == 0) { + CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n", + total_count, times, total_count / times); + } + + LASSERT(bio != NULL); + LASSERT(count <= atomic_read(&lo->lo_pending)); + loop_handle_bio(lo, bio); + atomic_sub(count, &lo->lo_pending); + } + cl_env_put(env, &refcheck); + +out: + up(&lo->lo_sem); + return ret; +} + +static int loop_set_fd(struct lloop_device *lo, struct file *unused, + struct block_device *bdev, struct file *file) +{ + struct inode *inode; + struct address_space *mapping; + int lo_flags = 0; + int error; + loff_t size; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + error = -EBUSY; + if (lo->lo_state != LLOOP_UNBOUND) + goto out; + + mapping = file->f_mapping; + inode = mapping->host; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC) + goto out; + + if 
(!(file->f_mode & FMODE_WRITE)) + lo_flags |= LO_FLAGS_READ_ONLY; + + size = get_loop_size(lo, file); + + if ((loff_t)(sector_t)size != size) { + error = -EFBIG; + goto out; + } + + /* remove all pages in cache so as dirty pages not to be existent. */ + truncate_inode_pages(mapping, 0); + + set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + + lo->lo_blocksize = PAGE_CACHE_SIZE; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; + lo->lo_backing_file = file; + lo->lo_sizelimit = 0; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + + lo->lo_bio = lo->lo_biotail = NULL; + + /* + * set queue make_request_fn, and add limits based on lower level + * device + */ + blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue->queuedata = lo; + + /* queue parameters */ + CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8))); + blk_queue_logical_block_size(lo->lo_queue, + (unsigned short)PAGE_CACHE_SIZE); + blk_queue_max_hw_sectors(lo->lo_queue, + LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9)); + blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); + + set_capacity(disks[lo->lo_number], size); + bd_set_size(bdev, size << 9); + + set_blocksize(bdev, lo->lo_blocksize); + + kthread_run(loop_thread, lo, "lloop%d", lo->lo_number); + down(&lo->lo_sem); + return 0; + +out: + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return error; +} + +static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, + int count) +{ + struct file *filp = lo->lo_backing_file; + gfp_t gfp = lo->old_gfp_mask; + + if (lo->lo_state != LLOOP_BOUND) + return -ENXIO; + + if (lo->lo_refcnt > count) /* we needed one fd for the ioctl */ + return -EBUSY; + + if (filp == NULL) + return -EINVAL; + + spin_lock_irq(&lo->lo_lock); + lo->lo_state = LLOOP_RUNDOWN; + spin_unlock_irq(&lo->lo_lock); + wake_up(&lo->lo_bh_wait); + + down(&lo->lo_sem); + lo->lo_backing_file = NULL; + lo->lo_device = NULL; + lo->lo_offset = 0; + lo->lo_sizelimit = 0; + lo->lo_flags = 0; + invalidate_bdev(bdev); + set_capacity(disks[lo->lo_number], 0); + bd_set_size(bdev, 0); + mapping_set_gfp_mask(filp->f_mapping, gfp); + lo->lo_state = LLOOP_UNBOUND; + fput(filp); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return 0; +} + +static int lo_open(struct block_device *bdev, fmode_t mode) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); + + return 0; +} + +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + struct lloop_device *lo = disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + --lo->lo_refcnt; + mutex_unlock(&lo->lo_ctl_mutex); +} + +/* lloop device node's ioctl function. 
*/ +static int lo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + struct inode *inode = NULL; + int err = 0; + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_DETACH: { + err = loop_clr_fd(lo, bdev, 2); + if (err == 0) + blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + break; + } + + case LL_IOC_LLOOP_INFO: { + struct lu_fid fid; + + if (lo->lo_backing_file == NULL) { + err = -ENOENT; + break; + } + if (inode == NULL) + inode = file_inode(lo->lo_backing_file); + if (lo->lo_state == LLOOP_BOUND) + fid = ll_i2info(inode)->lli_fid; + else + fid_zero(&fid); + + if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid))) + err = -EFAULT; + break; + } + + default: + err = -EINVAL; + break; + } + mutex_unlock(&lloop_mutex); + + return err; +} + +static struct block_device_operations lo_fops = { + .owner = THIS_MODULE, + .open = lo_open, + .release = lo_release, + .ioctl = lo_ioctl, +}; + +/* dynamic iocontrol callback. + * This callback is registered in lloop_init and will be called by + * ll_iocontrol_call. + * + * This is a llite regular file ioctl function. It takes the responsibility + * of attaching or detaching a file by a lloop's device number. + */ +static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, + unsigned int cmd, unsigned long arg, + void *magic, int *rcp) +{ + struct lloop_device *lo = NULL; + struct block_device *bdev = NULL; + int err = 0; + dev_t dev; + + if (magic != ll_iocontrol_magic) + return LLIOC_CONT; + + if (disks == NULL) { + err = -ENODEV; + goto out1; + } + + CWARN("Enter llop_ioctl\n"); + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_ATTACH: { + struct lloop_device *lo_free = NULL; + int i; + + for (i = 0; i < max_loop; i++, lo = NULL) { + lo = &loop_dev[i]; + if (lo->lo_state == LLOOP_UNBOUND) { + if (!lo_free) + lo_free = lo; + continue; + } + if (file_inode(lo->lo_backing_file) == file_inode(file)) + break; + } + if (lo || !lo_free) { + err = -EBUSY; + goto out; + } + + lo = lo_free; + dev = MKDEV(lloop_major, lo->lo_number); + + /* quit if the used pointer is writable */ + if (put_user((long)old_encode_dev(dev), (long *)arg)) { + err = -EFAULT; + goto out; + } + + bdev = blkdev_get_by_dev(dev, file->f_mode, NULL); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto out; + } + + get_file(file); + err = loop_set_fd(lo, NULL, bdev, file); + if (err) { + fput(file); + blkdev_put(bdev, 0); + } + + break; + } + + case LL_IOC_LLOOP_DETACH_BYDEV: { + int minor; + + dev = old_decode_dev(arg); + if (MAJOR(dev) != lloop_major) { + err = -EINVAL; + goto out; + } + + minor = MINOR(dev); + if (minor > max_loop - 1) { + err = -EINVAL; + goto out; + } + + lo = &loop_dev[minor]; + if (lo->lo_state != LLOOP_BOUND) { + err = -EINVAL; + goto out; + } + + bdev = lo->lo_device; + err = loop_clr_fd(lo, bdev, 1); + if (err == 0) + blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + + break; + } + + default: + err = -EINVAL; + break; + } + +out: + mutex_unlock(&lloop_mutex); +out1: + if (rcp) + *rcp = err; + return LLIOC_STOP; +} + +static int __init lloop_init(void) +{ + int i; + unsigned int cmdlist[] = { + LL_IOC_LLOOP_ATTACH, + LL_IOC_LLOOP_DETACH_BYDEV, + }; + + if (max_loop < 1 || max_loop > 256) { + max_loop = MAX_LOOP_DEFAULT; + CWARN("lloop: invalid max_loop (must be between 1 and 256), using default (%u)\n", + max_loop); + } + + lloop_major = register_blkdev(0, "lloop"); + if (lloop_major < 0) + return -EIO; + + 
CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n", + lloop_major, max_loop); + + ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist); + if (ll_iocontrol_magic == NULL) + goto out_mem1; + + loop_dev = kcalloc(max_loop, sizeof(*loop_dev), GFP_KERNEL); + if (!loop_dev) + goto out_mem1; + + disks = kcalloc(max_loop, sizeof(*disks), GFP_KERNEL); + if (!disks) + goto out_mem2; + + for (i = 0; i < max_loop; i++) { + disks[i] = alloc_disk(1); + if (!disks[i]) + goto out_mem3; + } + + mutex_init(&lloop_mutex); + + for (i = 0; i < max_loop; i++) { + struct lloop_device *lo = &loop_dev[i]; + struct gendisk *disk = disks[i]; + + lo->lo_queue = blk_alloc_queue(GFP_KERNEL); + if (!lo->lo_queue) + goto out_mem4; + + mutex_init(&lo->lo_ctl_mutex); + sema_init(&lo->lo_sem, 0); + init_waitqueue_head(&lo->lo_bh_wait); + lo->lo_number = i; + spin_lock_init(&lo->lo_lock); + disk->major = lloop_major; + disk->first_minor = i; + disk->fops = &lo_fops; + sprintf(disk->disk_name, "lloop%d", i); + disk->private_data = lo; + disk->queue = lo->lo_queue; + } + + /* We cannot fail after we call this, so another loop!*/ + for (i = 0; i < max_loop; i++) + add_disk(disks[i]); + return 0; + +out_mem4: + while (i--) + blk_cleanup_queue(loop_dev[i].lo_queue); + i = max_loop; +out_mem3: + while (i--) + put_disk(disks[i]); + OBD_FREE(disks, max_loop * sizeof(*disks)); +out_mem2: + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +out_mem1: + unregister_blkdev(lloop_major, "lloop"); + ll_iocontrol_unregister(ll_iocontrol_magic); + CERROR("lloop: ran out of memory\n"); + return -ENOMEM; +} + +static void lloop_exit(void) +{ + int i; + + ll_iocontrol_unregister(ll_iocontrol_magic); + for (i = 0; i < max_loop; i++) { + del_gendisk(disks[i]); + blk_cleanup_queue(loop_dev[i].lo_queue); + put_disk(disks[i]); + } + + unregister_blkdev(lloop_major, "lloop"); + + OBD_FREE(disks, max_loop * sizeof(*disks)); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +} + +module_init(lloop_init); +module_exit(lloop_exit); + +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "maximum of lloop_device"); +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre virtual block device"); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c b/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c new file mode 100644 index 000000000..83a9b8547 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -0,0 +1,1536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "../include/lprocfs_status.h" +#include +#include "../include/obd_support.h" + +#include "llite_internal.h" +#include "vvp_internal.h" + +/* /proc/lustre/llite mount point registration */ +static struct file_operations ll_rw_extents_stats_fops; +static struct file_operations ll_rw_extents_stats_pp_fops; +static struct file_operations ll_rw_offset_stats_fops; + +static int ll_blksize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_blksize); + +static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytestotal); + +static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesfree); + +static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesavail); + +static int ll_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filestotal); + +static int ll_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filesfree); + +static 
int ll_client_type_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + LASSERT(sbi != NULL); + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + seq_puts(m, "remote client\n"); + else + seq_puts(m, "local client\n"); + + return 0; +} +LPROC_SEQ_FOPS_RO(ll_client_type); + +static int ll_fstype_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", sb->s_type->name); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_fstype); + +static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sb_uuid); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. + */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} +LPROC_SEQ_FOPS_RO(ll_site_stats); + +static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_readahead_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || pages_number > totalram_pages / 2) { + CERROR("can't set file readahead more than %lu MB\n", + totalram_pages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/ + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_mb); + +static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_readahead_per_file_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("can't set file readahead more than max_read_ahead_mb %lu MB\n", + sbi->ll_ra_info.ra_max_pages); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); + 
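The read-ahead tunables above accept megabyte values from user space but store page counts internally, converting with a shift of (20 - PAGE_CACHE_SHIFT) via lprocfs_write_frac_helper / lprocfs_seq_read_frac_helper. The following is a minimal user-space sketch of just that unit conversion, not part of the patch; it assumes a 4 KiB page size for illustration, whereas the driver derives the shift from PAGE_CACHE_SHIFT, so the value may differ on other configurations.

#include <stdio.h>

/*
 * Sketch of the MB <-> pages conversion used by the llite read-ahead
 * tunables.  SKETCH_PAGE_SHIFT is an assumption (4 KiB pages); the
 * kernel code uses PAGE_CACHE_SHIFT instead.
 */
#define SKETCH_PAGE_SHIFT 12
#define MB_SHIFT          20

static unsigned long mb_to_pages(unsigned long mb)
{
	/* 1 MB corresponds to 1 << (20 - page shift) pages */
	return mb << (MB_SHIFT - SKETCH_PAGE_SHIFT);
}

static unsigned long pages_to_mb(unsigned long pages)
{
	return pages >> (MB_SHIFT - SKETCH_PAGE_SHIFT);
}

int main(void)
{
	unsigned long mb = 40;	/* e.g. echo 40 > max_read_ahead_mb */
	unsigned long pages = mb_to_pages(mb);

	printf("%lu MB == %lu pages, back to %lu MB\n",
	       mb, pages, pages_to_mb(pages));
	return 0;
}

With the assumed 4 KiB pages the shift is 8, so a value of 40 MB written to max_read_ahead_mb corresponds to 10240 pages stored in ra_max_pages.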
+static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_read_ahead_whole_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. */ + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int shift = 20 - PAGE_CACHE_SHIFT; + int max_cached_mb; + int unused_mb; + + max_cached_mb = cache->ccc_lru_max >> shift; + unused_mb = atomic_read(&cache->ccc_lru_left) >> shift; + seq_printf(m, + "users: %d\n" + "max_cached_mb: %d\n" + "used_mb: %d\n" + "unused_mb: %d\n" + "reclaim_count: %u\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers); + return 0; +} + +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int mult, rc, pages_number; + int diff = 0; + int nrpages = 0; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - + kernbuf; + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || pages_number > totalram_pages) { + CERROR("%s: can't set max cache more than %lu MB\n", + ll_get_fsname(sb, NULL, 0), + totalram_pages >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + diff = pages_number - cache->ccc_lru_max; + spin_unlock(&sbi->ll_lock); + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_add(diff, &cache->ccc_lru_left); + rc = 0; + goto out; + } + + diff = -diff; + while (diff > 0) { + int tmp; + + /* reduce LRU budget from free slots. */ + do { + int ov, nv; + + ov = atomic_read(&cache->ccc_lru_left); + if (ov == 0) + break; + + nv = ov > diff ? 
ov - diff : 0; + rc = atomic_cmpxchg(&cache->ccc_lru_left, ov, nv); + if (likely(ov == rc)) { + diff -= ov - nv; + nrpages += ov - nv; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* difficult - have to ask OSCs to drop LRU slots. */ + tmp = diff << 1; + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + +out: + if (rc >= 0) { + spin_lock(&sbi->ll_lock); + cache->ccc_lru_max = pages_number; + spin_unlock(&sbi->ll_lock); + rc = count; + } else { + atomic_add(nrpages, &cache->ccc_lru_left); + } + return rc; +} +LPROC_SEQ_FOPS(ll_max_cached_mb); + +static int ll_checksum_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); + return 0; +} + +static ssize_t ll_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val) + sbi->ll_flags |= LL_SBI_CHECKSUM; + else + sbi->ll_flags &= ~LL_SBI_CHECKSUM; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(val), &val, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LPROC_SEQ_FOPS(ll_checksum); + +static int ll_max_rw_chunk_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + seq_printf(m, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk); + return 0; +} + +static ssize_t ll_max_rw_chunk_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + ll_s2sbi(sb)->ll_max_rw_chunk = val; + return count; +} +LPROC_SEQ_FOPS(ll_max_rw_chunk); + +static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) +{ + struct super_block *sb = m->private; + + if (ll_s2sbi(sb)->ll_stats_track_type == type) + seq_printf(m, "%d\n", ll_s2sbi(sb)->ll_stats_track_id); + else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) + seq_puts(m, "0 (all)\n"); + else + seq_puts(m, "untracked\n"); + + return 0; +} + +static int ll_wr_track_id(const char __user *buffer, unsigned long count, + void *data, enum stats_track_type type) +{ + struct super_block *sb = data; + int rc, pid; + + rc = lprocfs_write_helper(buffer, count, &pid); + if (rc) + return rc; + ll_s2sbi(sb)->ll_stats_track_id = pid; + if (pid == 0) + ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + else + ll_s2sbi(sb)->ll_stats_track_type = type; + lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + return count; +} + +static int ll_track_pid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PID); +} + +static ssize_t ll_track_pid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID); +} +LPROC_SEQ_FOPS(ll_track_pid); + +static int ll_track_ppid_seq_show(struct 
seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PPID); +} + +static ssize_t ll_track_ppid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID); +} +LPROC_SEQ_FOPS(ll_track_ppid); + +static int ll_track_gid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_GID); +} + +static ssize_t ll_track_gid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID); +} +LPROC_SEQ_FOPS(ll_track_gid); + +static int ll_statahead_max_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_sa_max); + return 0; +} + +static ssize_t ll_statahead_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val >= 0 && val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %d. Valid values are in the range [0, %d]\n", + val, LL_SA_RPC_MAX); + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_max); + +static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); + return 0; +} + +static ssize_t ll_statahead_agl_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + else + sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, + "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_statahead_stats); + +static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_LAZYSTATFS ? 
1 : 0); + return 0; +} + +static ssize_t ll_lazystatfs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + else + sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; + + return count; +} +LPROC_SEQ_FOPS(ll_lazystatfs); + +static int ll_max_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_max_easize); + +static int ll_default_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_default_easize); + +static int ll_max_cookiesize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int cookielen; + int rc; + + rc = ll_get_max_cookiesize(sbi, &cookielen); + if (rc) + return rc; + + seq_printf(m, "%u\n", cookielen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_max_cookiesize); + +static int ll_default_cookiesize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int cookielen; + int rc; + + rc = ll_get_default_cookiesize(sbi, &cookielen); + if (rc) + return rc; + + seq_printf(m, "%u\n", cookielen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_default_cookiesize); + +static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + const char *str[] = LL_SBI_FLAGS; + struct super_block *sb = m->private; + int flags = ll_s2sbi(sb)->ll_flags; + int i = 0; + + while (flags != 0) { + if (ARRAY_SIZE(str) <= i) { + CERROR("%s: Revise array LL_SBI_FLAGS to match sbi flags please.\n", + ll_get_fsname(sb, NULL, 0)); + return -EINVAL; + } + + if (flags & 0x1) + seq_printf(m, "%s ", str[i]); + flags >>= 1; + ++i; + } + seq_printf(m, "\b\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sbi_flags); + +static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); + + return 0; +} + +static ssize_t ll_xattr_cache_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct super_block *sb = seq->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + + return count; +} +LPROC_SEQ_FOPS(ll_xattr_cache); + +static struct lprocfs_vars lprocfs_llite_obd_vars[] = { + { "uuid", &ll_sb_uuid_fops, NULL, 0 }, + /* { "mntpt_path", ll_rd_path, 0, 0 }, */ + { "fstype", &ll_fstype_fops, NULL, 0 }, + { "site", &ll_site_stats_fops, NULL, 0 }, + { "blocksize", &ll_blksize_fops, NULL, 0 }, + { "kbytestotal", &ll_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", 
&ll_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &ll_kbytesavail_fops, NULL, 0 }, + { "filestotal", &ll_filestotal_fops, NULL, 0 }, + { "filesfree", &ll_filesfree_fops, NULL, 0 }, + { "client_type", &ll_client_type_fops, NULL, 0 }, + /* { "filegroups", lprocfs_rd_filegroups, 0, 0 }, */ + { "max_read_ahead_mb", &ll_max_readahead_mb_fops, NULL }, + { "max_read_ahead_per_file_mb", &ll_max_readahead_per_file_mb_fops, + NULL }, + { "max_read_ahead_whole_mb", &ll_max_read_ahead_whole_mb_fops, NULL }, + { "max_cached_mb", &ll_max_cached_mb_fops, NULL }, + { "checksum_pages", &ll_checksum_fops, NULL }, + { "max_rw_chunk", &ll_max_rw_chunk_fops, NULL }, + { "stats_track_pid", &ll_track_pid_fops, NULL }, + { "stats_track_ppid", &ll_track_ppid_fops, NULL }, + { "stats_track_gid", &ll_track_gid_fops, NULL }, + { "statahead_max", &ll_statahead_max_fops, NULL }, + { "statahead_agl", &ll_statahead_agl_fops, NULL }, + { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 }, + { "lazystatfs", &ll_lazystatfs_fops, NULL }, + { "max_easize", &ll_max_easize_fops, NULL, 0 }, + { "default_easize", &ll_default_easize_fops, NULL, 0 }, + { "max_cookiesize", &ll_max_cookiesize_fops, NULL, 0 }, + { "default_cookiesize", &ll_default_cookiesize_fops, NULL, 0 }, + { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 }, + { "xattr_cache", &ll_xattr_cache_fops, NULL, 0 }, + { NULL } +}; + +#define MAX_STRING_SIZE 128 + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, + { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes" }, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_read" }, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_write" }, + { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_read" }, + { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, + { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, + /* special inode operation */ + { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, + { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, 
+ { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) +{ + if (!sbi->ll_stats) + return; + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->real_parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", +}; + +LPROC_SEQ_FOPS_RO_TYPE(llite, name); +LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); + +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc) +{ + struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct proc_dir_entry *dir; + char name[MAX_STRING_SIZE + 1], *ptr; + int err, id, len, rc; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + LASSERT(mdc != NULL); + LASSERT(osc != NULL); + + /* Get fsname */ + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, + lsi->lsi_lmd->lmd_profile, sb); + + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) { + err = PTR_ERR(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + return err; + } + + rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, + &vvp_dump_pgcache_file_ops, sbi); + if (rc) + CWARN("Error adding the dump_page_cache file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); + if (rc) + CWARN("Error adding the extent_stats file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", + 0644, &ll_rw_extents_stats_pp_fops, sbi); + if (rc) + CWARN("Error adding the extents_stats_per_process file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); + if (rc) + CWARN("Error adding the offset_stats file\n"); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) { + err = -ENOMEM; + goto out; + } + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + __u32 type = 
llite_opcode_table[id].type; + void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) + ptr = "regs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_PAGES) + ptr = "pages"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, + (type & LPROCFS_CNTR_AVGMINMAX), + llite_opcode_table[id].opname, ptr); + } + err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + if (err) + goto out; + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) { + err = -ENOMEM; + goto out; + } + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + goto out; + + + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + if (err) + goto out; + + /* MDC info */ + obd = class_name2obd(mdc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) { + err = -ENOMEM; + goto out; + } + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + /* OSC */ + obd = class_name2obd(osc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) { + err = -ENOMEM; + goto out; + } + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + return err; +} + +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) +{ + if (sbi->ll_proc_root) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } +} +#undef MAX_STRING_SIZE + +#define pct(a, b) (b ? a * 100 / b : 0) + +static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info = &io_extents->pp_extents[which]; + + read_cum = 0; + write_cum = 0; + start = 0; + + for (i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for (i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | %14lu %4lu %4lu\n", + start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? 
'+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == 1<<10) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int k; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_pp_extent_lock); + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (io_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + io_extents->pp_extents[k].pid); + ll_display_extents_info(io_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, + loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc = lprocfs_write_helper(buf, len, &value); + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + return len; +} + +LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_lock); + ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc 
= lprocfs_write_helper(buf, len, &value); + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} +LPROC_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + if (!sbi->ll_rw_stats_on) + return; + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + + spin_lock(&sbi->ll_pp_extent_lock); + /* Extent statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (io_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + io_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); + } + + for(i = 0; (count >= (1 << LL_HIST_START << i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); + return; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if (process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if (process[i].rw_largest_extent < count) + process[i].rw_largest_extent = 
count; + process[i].rw_last_file_pos = pos + count; + spin_unlock(&sbi->ll_process_lock); + return; + } + } + *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; + struct ll_rw_process_info *process = sbi->ll_rw_process_info; + int i; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + spin_lock(&sbi->ll_process_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + /* We stored the discontiguous offsets here; print them first */ + for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + /* Then print the current offsets for each process */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + process[i].rw_op == READ ? 
'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; + struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc = lprocfs_write_helper(buf, len, &value); + + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + memset(process_info, 0, sizeof(struct ll_rw_process_info) * + LL_PROCESS_HIST_MAX); + memset(offset_info, 0, sizeof(struct ll_rw_process_info) * + LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LPROC_SEQ_FOPS(ll_rw_offset_stats); + +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = NULL; + lvars->obd_vars = lprocfs_llite_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/namei.c b/kernel/drivers/staging/lustre/lustre/llite/namei.c new file mode 100644 index 000000000..5a25dcd10 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/namei.c @@ -0,0 +1,1178 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "llite_internal.h" + +static int ll_create_it(struct inode *, struct dentry *, + int, struct lookup_intent *); + +/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */ +static int ll_test_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_md *md = opaque; + + if (unlikely(!(md->body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } + + if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1)) + return 0; + + return 1; +} + +static int ll_set_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = ((struct lustre_md *)opaque)->body; + + if (unlikely(!(body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return -EINVAL; + } + + lli->lli_fid = body->fid1; + if (unlikely(!(body->valid & OBD_MD_FLTYPE))) { + CERROR("Can not initialize inode " DFID + " without object type: valid = %#llx\n", + PFID(&lli->lli_fid), body->valid); + return -EINVAL; + } + + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT); + if (unlikely(inode->i_mode == 0)) { + CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); + return -EINVAL; + } + + ll_lli_init(lli); + + return 0; +} + + +/* + * Get an inode by inode number (already instantiated by the intent lookup). + * Returns inode or NULL + */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + + if (inode) { + if (inode->i_state & I_NEW) { + int rc = 0; + + ll_read_inode2(inode, md); + if (S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) { + CDEBUG(D_INODE, + "%s: apply lsm %p to inode "DFID".\n", + ll_get_fsname(sb, NULL, 0), md->lsm, + PFID(ll_inode2fid(inode))); + rc = cl_file_inode_init(inode, md); + } + if (rc != 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else + unlock_new_inode(inode); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) + ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n", + inode, PFID(&md->body->fid1)); + } + return inode; +} + +static void ll_invalidate_negative_children(struct inode *dir) +{ + struct dentry *dentry, *tmp_subdir; + struct ll_d_hlist_node *p; + + ll_lock_dcache(dir); + ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + struct dentry *child; + + list_for_each_entry_safe(child, tmp_subdir, + &dentry->d_subdirs, + d_child) { + if (d_really_is_negative(child)) + d_lustre_invalidate(child, 1); + } + } + spin_unlock(&dentry->d_lock); + } + ll_unlock_dcache(dir); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + return rc; + } + break; + case LDLM_CB_CANCELING: { + struct inode *inode = 
ll_inode_from_resource_lock(lock); + __u64 bits = lock->l_policy_data.l_inodebits.bits; + + /* Inode is set to lock->l_resource->lr_lvb_inode + * for mdc - bug 24555 */ + LASSERT(lock->l_ast_data == NULL); + + if (inode == NULL) + break; + + /* Invalidate all dentries associated with this inode */ + LASSERT(lock->l_flags & LDLM_FL_CANCELING); + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + } + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode %lu\n", + inode->i_ino); + truncate_inode_pages(inode->i_mapping, 0); + ll_invalidate_negative_children(inode); + } + + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + !is_root_inode(inode)) + ll_invalidate_aliases(inode); + + iput(inode); + break; + } + default: + LBUG(); + } + + return 0; +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (in_group_p(i->i_gid)) + return (__u32)from_kgid(&init_user_ns, i->i_gid); + else + return (__u32)(-1); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ +#if 0 + int i; +#endif + + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; + +#if 0 + for (i = 0; i < current_ngroups; i++) { + if (suppgids[0] == -1) { + if (current_groups[i] != suppgids[1]) + suppgids[0] = current_groups[i]; + continue; + } + if (suppgids[1] == -1) { + if (current_groups[i] != suppgids[0]) + suppgids[1] = current_groups[i]; + continue; + } + break; + } +#endif +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. 
INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + struct ll_d_hlist_node *p; + + if (ll_d_hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_u.d_alias) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if (alias->d_flags & DCACHE_DISCONNECTED) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + ll_unlock_dcache(inode); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. + */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + int rc; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + rc = ll_d_init(new); + if (rc < 0) { + dput(new); + return ERR_PTR(rc); + } + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, d_inode(new), d_count(new), new->d_flags); + return new; + } + } + rc = ll_d_init(de); + if (rc < 0) + return ERR_PTR(rc); + d_add(de, inode); + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, d_inode(de), d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->d.lustre.it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + return rc; + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + + /* We used to query real size from OSTs here, but actually + this is not needed. For stat() calls size would be updated + from subsequent do_revalidate()->ll_inode_revalidate_it() in + 2.4 and + vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + Everybody else who needs correct file size would call + ll_glimpse_size or some equivalent themselves anyway. + Also see bug 7198. */ + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passing hashed dentries for open. + */ + if (d_unhashed(*de)) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *de); + if (IS_ERR(alias)) + return PTR_ERR(alias); + *de = alias; + } else if (!it_disposition(it, DISP_LOOKUP_NEG) && + !it_disposition(it, DISP_OPEN_CREATE)) { + /* With DISP_OPEN_CREATE dentry will + instantiated in ll_create_it. 
*/ + LASSERT(d_inode(*de) == NULL); + d_instantiate(*de, inode); + } + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(*de); + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* If file created on server, don't depend on parent UPDATE + * lock to unhide it. It is left hidden and next lookup can + * find it in ll_splice_alias. + */ + /* Check that parent has UPDATE lock. */ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, + &ll_i2info(parent)->lli_fid, NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + return 0; +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, int lookup_flags) +{ + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct inode *inode; + struct md_op_data *op_data; + __u32 opc; + int rc; + + if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n", + dentry, parent->i_ino, + parent->i_generation, parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + if (it == NULL || it->it_op == IT_GETXATTR) + it = &lookup_it; + + if (it->it_op == IT_GETATTR) { + rc = ll_statahead_enter(parent, &dentry, 0); + if (rc == 1) { + if (dentry == save) + retval = NULL; + else + retval = dentry; + goto out; + } + } + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_ANY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, + dentry->d_name.len, lookup_flags, opc, + NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it, + lookup_flags, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + retval = ERR_PTR(rc); + goto out; + } + + rc = ll_lookup_it_finish(req, it, parent, &dentry); + if (rc != 0) { + ll_intent_release(it); + retval = ERR_PTR(rc); + goto out; + } + + inode = d_inode(dentry); + if ((it->it_op & IT_OPEN) && inode && + !S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ll_release_openhandle(inode, it); + } + ll_lookup_finish_locks(it, inode); + + if (dentry == save) + retval = NULL; + else + retval = dentry; + goto out; + out: + if (req) + ptlrpc_req_finished(req); + if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry)) + ll_statahead_mark(parent, dentry); + return retval; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u\n", + dentry, parent->i_ino, + parent->i_generation, parent, flags); + + /* Optimize away (CREATE && !OPEN). Let .create handle the race. 
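+ * Returning NULL keeps the dentry negative, so the VFS falls back to
+ * ->create (ll_create_nd) or ->mknod on it and no lookup RPC is sent
+ * for a name that is about to be created anyway.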
*/ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN)) + return NULL; + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, 0); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + int rc = 0; + + CDEBUG(D_VFSTRACE, + "VFS Op:name=%pd,dir=%lu/%u(%p),file %p,open_flags %x,mode %x opened %d\n", + dentry, dir->i_ino, + dir->i_generation, dir, file, open_flags, mode, *opened); + + it = kzalloc(sizeof(*it), GFP_NOFS); + if (!it) + return -ENOMEM; + + it->it_op = IT_OPEN; + if (open_flags & O_CREAT) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, lookup_flags); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, mode, it); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + + *opened |= FILE_CREATED; + } + if (d_really_is_positive(dentry) && it_disposition(it, DISP_OPEN_OPEN)) { + /* Open dentry. */ + if (S_ISFIFO(d_inode(dentry)->i_mode)) { + /* We cannot call open here as it would + * deadlock. + */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished( + (struct ptlrpc_request *) + it->d.lustre.it_data); + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = finish_open(file, dentry, NULL, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + return rc; +} + + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + + LASSERT(it && it->d.lustre.it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->d.lustre.it_data; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, request, dir->i_sb, it); + if (rc) { + inode = ERR_PTR(rc); + goto out; + } + + LASSERT(ll_d_hlist_empty(&inode->i_dentry)); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. 
+ * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, + struct lookup_intent *it) +{ + struct inode *inode; + int rc = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n", + dentry, dir->i_ino, + dir->i_generation, dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + return rc; + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, inode); + return 0; +} + +static void ll_update_times(struct ptlrpc_request *request, + struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->valid & OBD_MD_FLMTIME && + body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n", + inode->i_ino, LTIME_S(inode->i_mtime), body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME && + body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; +} + +static int ll_new_node(struct inode *dir, struct dentry *dentry, + const char *tgt, int mode, int rdev, + __u32 opc) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int tgt_len = 0; + int err; + + if (unlikely(tgt != NULL)) + tgt_len = strlen(tgt) + 1; + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + 0, opc, NULL); + if (IS_ERR(op_data)) { + err = PTR_ERR(op_data); + goto err_exit; + } + + err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), rdev, &request); + ll_finish_md_op_data(op_data); + if (err) + goto err_exit; + + ll_update_times(request, dir); + + err = ll_prep_inode(&inode, request, dir->i_sb, NULL); + if (err) + goto err_exit; + + d_instantiate(dentry, inode); +err_exit: + ptlrpc_req_finished(request); + + return err; +} + +static int ll_mknod(struct inode *dir, struct dentry *dchild, + umode_t mode, dev_t rdev) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p) mode %o dev %x\n", + dchild, dir->i_ino, dir->i_generation, dir, + mode, old_encode_dev(rdev)); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, + old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + + return err; +} + +/* + * Plain create. Intent create is handled in atomic_open. 
+ */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u, excl=%d\n", + dentry, dir->i_ino, + dir->i_generation, dir, mode, want_excl); + + rc = ll_mknod(dir, dentry, mode, 0); + + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", + dentry, d_unhashed(dentry)); + + return rc; +} + +static inline void ll_get_child_fid(struct dentry *child, struct lu_fid *fid) +{ + if (d_really_is_positive(child)) + *fid = *ll_inode2fid(d_inode(child)); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + namelen, name, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + return rc; +} + +int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) +{ + struct mdt_body *body; + struct lov_mds_md *eadata; + struct lov_stripe_md *lsm = NULL; + struct obd_trans_info oti = { 0 }; + struct obdo *oa; + struct obd_capa *oc = NULL; + int rc; + + /* req is swabbed so this is safe */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (!(body->valid & OBD_MD_FLEASIZE)) + return 0; + + if (body->eadatasize == 0) { + CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); + rc = -EPROTO; + goto out; + } + + /* The MDS sent back the EA because we unlinked the last reference + * to this file. Use this EA to unlink the objects on the OST. + * It's opaque so we don't swab here; we leave it to obd_unpackmd() to + * check it is complete and sensible. */ + eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(eadata != NULL); + + rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize); + if (rc < 0) { + CERROR("obd_unpackmd: %d\n", rc); + goto out; + } + LASSERT(rc >= sizeof(*lsm)); + + OBDO_ALLOC(oa); + if (oa == NULL) { + rc = -ENOMEM; + goto out_free_memmd; + } + + oa->o_oi = lsm->lsm_oi; + oa->o_mode = body->mode & S_IFMT; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP; + + if (body->valid & OBD_MD_FLCOOKIE) { + oa->o_valid |= OBD_MD_FLCOOKIE; + oti.oti_logcookies = + req_capsule_server_sized_get(&request->rq_pill, + &RMF_LOGCOOKIES, + sizeof(struct llog_cookie) * + lsm->lsm_stripe_count); + if (oti.oti_logcookies == NULL) { + oa->o_valid &= ~OBD_MD_FLCOOKIE; + body->valid &= ~OBD_MD_FLCOOKIE; + } + } + + if (body->valid & OBD_MD_FLOSSCAPA) { + rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc); + if (rc) + goto out_free_memmd; + } + + rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti, + ll_i2mdexp(dir), oc); + capa_put(oc); + if (rc) + CERROR("obd destroy objid "DOSTID" error %d\n", + POSTID(&lsm->lsm_oi), rc); +out_free_memmd: + obd_free_memmd(ll_i2dtexp(dir), &lsm); + OBDO_FREE(oa); +out: + return rc; +} + +/* ll_unlink() doesn't update the inode with the new link count. 
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there + * is any lock existing. They will recycle dentries and inodes based upon locks + * too. b=20433 */ +static int ll_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(dentry, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + goto out; + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); + + rc = ll_objects_destroy(request, dir); + out: + ptlrpc_req_finished(request); + return rc; +} + +static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); + + return err; +} + +static int ll_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(dentry, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + return rc; +} + +static int ll_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),target=%.*s\n", + dentry, dir->i_ino, dir->i_generation, + dir, 3000, oldname); + + err = ll_new_node(dir, dentry, oldname, S_IFLNK | S_IRWXUGO, + 0, LUSTRE_OPC_SYMLINK); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); + + return err; +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = d_inode(old_dentry); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int err; + + CDEBUG(D_VFSTRACE, + "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%pd\n", + src->i_ino, src->i_generation, src, dir->i_ino, + dir->i_generation, dir, new_dentry); + + op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name, + new_dentry->d_name.len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + goto out; + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); +out: + ptlrpc_req_finished(request); + return 
err; +} + +static int ll_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(old_dir); + struct md_op_data *op_data; + int err; + + CDEBUG(D_VFSTRACE, + "VFS Op:oldname=%pd,src_dir=%lu/%u(%p),newname=%pd,tgt_dir=%lu/%u(%p)\n", + old_dentry, old_dir->i_ino, old_dir->i_generation, old_dir, + new_dentry, new_dir->i_ino, new_dir->i_generation, new_dir); + + op_data = ll_prep_md_op_data(NULL, old_dir, new_dir, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(old_dentry, &op_data->op_fid3); + ll_get_child_fid(new_dentry, &op_data->op_fid4); + err = md_rename(sbi->ll_md_exp, op_data, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_dentry->d_name.name, + new_dentry->d_name.len, &request); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, old_dir); + ll_update_times(request, new_dir); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); + err = ll_objects_destroy(request, old_dir); + } + + ptlrpc_req_finished(request); + if (!err) + d_move(old_dentry, new_dentry); + return err; +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, + .atomic_open = ll_atomic_open, + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c b/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c new file mode 100644 index 000000000..a58182600 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c @@ -0,0 +1,331 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/remote_perm.c + * + * Lustre Permission Cache for Remote Client + * + * Author: Lai Siyao + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" +#include "llite_internal.h" + +struct kmem_cache *ll_remote_perm_cachep = NULL; +struct kmem_cache *ll_rmtperm_hash_cachep = NULL; + +static inline struct ll_remote_perm *alloc_ll_remote_perm(void) +{ + struct ll_remote_perm *lrp; + + OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL); + if (lrp) + INIT_HLIST_NODE(&lrp->lrp_list); + return lrp; +} + +static inline void free_ll_remote_perm(struct ll_remote_perm *lrp) +{ + if (!lrp) + return; + + if (!hlist_unhashed(&lrp->lrp_list)) + hlist_del(&lrp->lrp_list); + OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp)); +} + +static struct hlist_head *alloc_rmtperm_hash(void) +{ + struct hlist_head *hash; + int i; + + OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash), + GFP_IOFS); + if (!hash) + return NULL; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + INIT_HLIST_HEAD(hash + i); + + return hash; +} + +void free_rmtperm_hash(struct hlist_head *hash) +{ + int i; + struct ll_remote_perm *lrp; + struct hlist_node *next; + + if (!hash) + return; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + hlist_for_each_entry_safe(lrp, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash)); +} + +static inline int remote_perm_hashfunc(uid_t uid) +{ + return uid & (REMOTE_PERM_HASHSIZE - 1); +} + +/* NB: setxid permission is not checked here, instead it's done on + * MDT when client get remote permission. */ +static int do_check_remote_perm(struct ll_inode_info *lli, int mask) +{ + struct hlist_head *head; + struct ll_remote_perm *lrp; + int found = 0, rc; + + if (!lli->lli_remote_perms) + return -ENOENT; + + head = lli->lli_remote_perms + + remote_perm_hashfunc(from_kuid(&init_user_ns, current_uid())); + + spin_lock(&lli->lli_lock); + hlist_for_each_entry(lrp, head, lrp_list) { + if (lrp->lrp_uid != from_kuid(&init_user_ns, current_uid())) + continue; + if (lrp->lrp_gid != from_kgid(&init_user_ns, current_gid())) + continue; + if (lrp->lrp_fsuid != from_kuid(&init_user_ns, current_fsuid())) + continue; + if (lrp->lrp_fsgid != from_kgid(&init_user_ns, current_fsgid())) + continue; + found = 1; + break; + } + + if (!found) { + rc = -ENOENT; + goto out; + } + + CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n", + lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + rc = ((lrp->lrp_access_perm & mask) == mask) ? 
0 : -EACCES; + +out: + spin_unlock(&lli->lli_lock); + return rc; +} + +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_remote_perm *lrp = NULL, *tmp = NULL; + struct hlist_head *head, *perm_hash = NULL; + + LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT); + +#if 0 + if (perm->rp_uid != current->uid || + perm->rp_gid != current->gid || + perm->rp_fsuid != current->fsuid || + perm->rp_fsgid != current->fsgid) { + /* user might setxid in this small period */ + CDEBUG(D_SEC, + "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n", + perm->rp_uid, perm->rp_gid, perm->rp_fsuid, + perm->rp_fsgid, current->uid, current->gid, + current->fsuid, current->fsgid); + return -EAGAIN; + } +#endif + + if (!lli->lli_remote_perms) { + perm_hash = alloc_rmtperm_hash(); + if (perm_hash == NULL) { + CERROR("alloc lli_remote_perms failed!\n"); + return -ENOMEM; + } + } + + spin_lock(&lli->lli_lock); + + if (!lli->lli_remote_perms) + lli->lli_remote_perms = perm_hash; + else + free_rmtperm_hash(perm_hash); + + head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid); + +again: + hlist_for_each_entry(tmp, head, lrp_list) { + if (tmp->lrp_uid != perm->rp_uid) + continue; + if (tmp->lrp_gid != perm->rp_gid) + continue; + if (tmp->lrp_fsuid != perm->rp_fsuid) + continue; + if (tmp->lrp_fsgid != perm->rp_fsgid) + continue; + free_ll_remote_perm(lrp); + lrp = tmp; + break; + } + + if (!lrp) { + spin_unlock(&lli->lli_lock); + lrp = alloc_ll_remote_perm(); + if (!lrp) { + CERROR("alloc memory for ll_remote_perm failed!\n"); + return -ENOMEM; + } + spin_lock(&lli->lli_lock); + goto again; + } + + lrp->lrp_access_perm = perm->rp_access_perm; + if (lrp != tmp) { + lrp->lrp_uid = perm->rp_uid; + lrp->lrp_gid = perm->rp_gid; + lrp->lrp_fsuid = perm->rp_fsuid; + lrp->lrp_fsgid = perm->rp_fsgid; + hlist_add_head(&lrp->lrp_list, head); + } + lli->lli_rmtperm_time = cfs_time_current(); + spin_unlock(&lli->lli_lock); + + CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n", + lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + + return 0; +} + +int lustre_check_remote_perm(struct inode *inode, int mask) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_remote_perm *perm; + struct obd_capa *oc; + unsigned long save; + int i = 0, rc; + + do { + save = lli->lli_rmtperm_time; + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) + break; + + might_sleep(); + + mutex_lock(&lli->lli_rmtperm_mutex); + /* check again */ + if (save != lli->lli_rmtperm_time) { + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + } + + if (i++ > 5) { + CERROR("check remote perm falls in dead loop!\n"); + LBUG(); + } + + oc = ll_mdscapa_get(inode); + rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc, + ll_i2suppgid(inode), &req); + capa_put(oc); + if (rc) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + + perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (unlikely(perm == NULL)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + rc = -EPROTO; + break; + } + + rc = ll_update_remote_perm(inode, perm); + mutex_unlock(&lli->lli_rmtperm_mutex); + if (rc == -ENOMEM) + break; + + ptlrpc_req_finished(req); + req = NULL; + } while (1); + ptlrpc_req_finished(req); 
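+ /* req may still hold a reply from an iteration that broke out early
+ * (e.g. -EPROTO or -ENOMEM); the call above releases it, and it is a
+ * no-op when the loop already dropped the request and set req to NULL. */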
+ return rc; +} + +#if 0 /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock, + * because it will fail sanity test 48. + */ +void ll_free_remote_perms(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct hlist_head *hash = lli->lli_remote_perms; + struct ll_remote_perm *lrp; + struct hlist_node *node, *next; + int i; + + LASSERT(hash); + + spin_lock(&lli->lli_lock); + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) { + hlist_for_each_entry_safe(lrp, node, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + } + + spin_unlock(&lli->lli_lock); +} +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/rw.c b/kernel/drivers/staging/lustre/lustre/llite/rw.c new file mode 100644 index 000000000..991d20c50 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/rw.c @@ -0,0 +1,1289 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +/* current_is_kswapd() */ +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "../include/obd_cksum.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +/** + * Finalizes cl-data before exiting typical address_space operation. Dual to + * ll_cl_init(). + */ +static void ll_cl_fini(struct ll_cl_context *lcc) +{ + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(lcc->lcc_cookie == current); + LASSERT(env != NULL); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + + cl_env_put(env, &lcc->lcc_refcheck); +} + +/** + * Initializes common cl-data at the typical address_space operation entry + * point. 
+ */ +static struct ll_cl_context *ll_cl_init(struct file *file, + struct page *vmpage, int create) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clob; + struct ccc_io *cio; + + int refcheck; + int result = 0; + + clob = ll_i2info(vmpage->mapping->host)->lli_clob; + LASSERT(clob != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return ERR_CAST(env); + + lcc = &vvp_env_info(env)->vti_io_ctx; + memset(lcc, 0, sizeof(*lcc)); + lcc->lcc_env = env; + lcc->lcc_refcheck = refcheck; + lcc->lcc_cookie = current; + + cio = ccc_env_io(env); + io = cio->cui_cl.cis_io; + if (io == NULL && create) { + struct inode *inode = vmpage->mapping->host; + loff_t pos; + + if (mutex_trylock(&inode->i_mutex)) { + mutex_unlock(&(inode)->i_mutex); + + /* this is too bad. Someone is trying to write the + * page w/o holding inode mutex. This means we can + * add dirty pages into cache during truncate */ + CERROR("Proc %s is dirtying page w/o inode lock, this will break truncate\n", + current->comm); + dump_stack(); + LBUG(); + return ERR_PTR(-EIO); + } + + /* + * Loop-back driver calls ->prepare_write(). + * methods directly, bypassing file system ->write() operation, + * so cl_io has to be created here. + */ + io = ccc_env_thread_io(env); + ll_io_init(io, file, 1); + + /* No lock at all for this kind of IO - we can't do it because + * we have held page lock, it would cause deadlock. + * XXX: This causes poor performance to loop device - One page + * per RPC. + * In order to get better performance, users should use + * lloop driver instead. + */ + io->ci_lockreq = CILR_NEVER; + + pos = vmpage->index << PAGE_CACHE_SHIFT; + + /* Create a temp IO to serve write. */ + result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE); + if (result == 0) { + cio->cui_fd = LUSTRE_FPRIVATE(file); + cio->cui_iter = NULL; + result = cl_io_iter_init(env, io); + if (result == 0) { + result = cl_io_lock(env, io); + if (result == 0) + result = cl_io_start(env, io); + } + } else + result = io->ci_result; + } + + lcc->lcc_io = io; + if (io == NULL) + result = -EIO; + if (result == 0) { + struct cl_page *page; + + LASSERT(io != NULL); + LASSERT(io->ci_state == CIS_IO_GOING); + LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file)); + page = cl_page_find(env, clob, vmpage->index, vmpage, + CPT_CACHEABLE); + if (!IS_ERR(page)) { + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + result = 0; + } else + result = PTR_ERR(page); + } + if (result) { + ll_cl_fini(lcc); + lcc = ERR_PTR(result); + } + + CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n", + vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result, + env, io); + return lcc; +} + +static struct ll_cl_context *ll_cl_get(void) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + lcc = &vvp_env_info(env)->vti_io_ctx; + LASSERT(env == lcc->lcc_env); + LASSERT(current == lcc->lcc_cookie); + cl_env_put(env, &refcheck); + + /* env has got in ll_cl_init, so it is still usable. */ + return lcc; +} + +/** + * ->prepare_write() address space operation called by generic_file_write() + * for every page during write. 
+ */ +int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + int result; + + lcc = ll_cl_init(file, vmpage, 1); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + cl_page_assume(env, io, page); + + result = cl_io_prepare_write(env, io, page, from, to); + if (result == 0) { + /* + * Add a reference, so that page is not evicted from + * the cache until ->commit_write() is called. + */ + cl_page_get(page); + lu_ref_add(&page->cp_reference, "prepare_write", + current); + } else { + cl_page_unassume(env, io, page); + ll_cl_fini(lcc); + } + /* returning 0 in prepare assumes commit must be called + * afterwards */ + } else { + result = PTR_ERR(lcc); + } + return result; +} + +int ll_commit_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result = 0; + + lcc = ll_cl_get(); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + + LASSERT(cl_page_is_owned(page, io)); + LASSERT(from <= to); + if (from != to) /* handle short write case. */ + result = cl_io_commit_write(env, io, page, from, to); + if (cl_page_is_owned(page, io)) + cl_page_unassume(env, io, page); + + /* + * Release reference acquired by ll_prepare_write(). + */ + lu_ref_del(&page->cp_reference, "prepare_write", current); + cl_page_put(env, page); + ll_cl_fini(lcc); + return result; +} + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt) +{ + __u64 opc; + + opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; + return ll_osscapa_get(inode, opc); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. - Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + + /* If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) { + ret = 0; + goto out; + } + + /* If the non-strided (ria_pages == 0) readahead window + * (ria_start + ret) has grown across an RPC boundary, then trim + * readahead size by the amount beyond the RPC so it ends on an + * RPC boundary. 
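+ * As a worked example, with PTLRPC_MAX_BRW_PAGES = 256, ria_start = 100
+ * and ret = 200, (100 + 200) % 256 = 44 pages would spill past the RPC
+ * boundary, so ret is trimmed to 156 and the window ends at page 255.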
If the readahead window is already ending on + * an RPC boundary (beyond_rpc == 0), or smaller than a full + * RPC (beyond_rpc < ret) the readahead size is unchanged. + * The (beyond_rpc != 0) check is skipped since the conditional + * branch is more expensive than subtracting zero from the result. + * + * Strided read is left unaligned to avoid small fragments beyond + * the RPC boundary from needing an extra read RPC. */ + if (ria->ria_pages == 0) { + long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES; + if (/* beyond_rpc != 0 && */ beyond_rpc < ret) + ret -= beyond_rpc; + } + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + return ret; +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(len, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(mapping->host); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ + "csr %lu sf %lu sp %lu sl %lu \n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, ras->ras_next_readahead, \ + ras->ras_requests, ras->ras_request_index, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_pages, ras->ras_stride_length) + +static int index_in_window(unsigned long index, unsigned long point, + unsigned long before, unsigned long after) +{ + unsigned long start = point - before, end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= index && index <= end; +} + +static struct ll_readahead_state *ll_ras_get(struct file *f) +{ + struct ll_file_data *fd; + + fd = LUSTRE_FPRIVATE(f); + return &fd->fd_ras; +} + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + rar->lrr_reader = current; + + list_add(&rar->lrr_linkage, &ras->ras_read_beads); + spin_unlock(&ras->ras_lock); +} + +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + list_del_init(&rar->lrr_linkage); + spin_unlock(&ras->ras_lock); +} + +static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras) +{ + struct ll_ra_read *scan; + + list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) { + if (scan->lrr_reader == current) + return scan; + } + return NULL; +} + +struct ll_ra_read *ll_ra_read_get(struct file *f) +{ + struct ll_readahead_state *ras; + struct ll_ra_read *bead; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + bead = ll_ra_read_get_locked(ras); + spin_unlock(&ras->ras_lock); + return bead; +} + +static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_page *page, + struct page *vmpage) +{ + struct ccc_page *cp; + int rc; + + rc = 0; + cl_page_assume(env, io, page); + 
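+ /* Only a page that is not yet uptodate and is still covered by an
+ * extent lock (cl_page_is_under_lock() returns -EBUSY) is queued for
+ * read-ahead below; otherwise the page is deleted and -ENOLCK stops
+ * the read-ahead scan, while an already completed page is skipped. */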
lu_ref_add(&page->cp_reference, "ra", current); + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) { + rc = cl_page_is_under_lock(env, io, page); + if (rc == -EBUSY) { + cp->cpg_defer_uptodate = 1; + cp->cpg_ra_used = 0; + cl_page_list_add(queue, page); + rc = 1; + } else { + cl_page_delete(env, page); + rc = -ENOLCK; + } + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + } + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + return rc; +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was added to \a queue. + * + * \retval -ENOLCK: there is no extent lock for this part of a file, stop + * read-ahead. + * + * \retval -ve, 0: page wasn't added to \a queue for other reason. + */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + pgoff_t index, struct address_space *mapping) +{ + struct page *vmpage; + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + struct cl_page *page; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + int rc = 0; + const char *msg = NULL; + + vmpage = grab_cache_page_nowait(mapping, index); + if (vmpage != NULL) { + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping == mapping) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + rc = cl_read_ahead_page(env, io, queue, + page, vmpage); + if (rc == -ENOLCK) { + which = RA_STAT_FAILED_MATCH; + msg = "lock match failed"; + } + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + } + } else { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + } + if (rc != 1) + unlock_page(vmpage); + page_cache_release(vmpage); + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + } + if (msg != NULL) { + ll_ra_stats_inc(mapping, which); + CDEBUG(D_READA, "%s\n", msg); + } + return rc; +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ + ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ + ria->ria_pages) + +/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't + * know what the actual RPC size is. If this needs to change, it makes more + * sense to tune the i_blkbits value for the file based on the OSTs it is + * striped over, rather than having a constant value for all files here. */ + +/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)). + * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled + * by default, this should be adjusted corresponding with max_read_ahead_mb + * and max_read_ahead_per_file_mb otherwise the readahead budget can be used + * up quickly which will affect read performance significantly. See LU-2816 */ +#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} +/* The function calculates how much pages will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_length = st_len, + * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... 
+ * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left + */ +static unsigned long +stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, + unsigned long off, unsigned long length) +{ + __u64 start = off > st_off ? off - st_off : 0; + __u64 end = off + length > st_off ? off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; + + if (st_len == 0 || length == 0 || end == 0) + return length; + + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; + + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; + + CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu \n", + start, end, start_left, end_left); + + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n", + st_off, st_len, st_pgs, off, length, pg_count); + + return pg_count; +} + +static int ria_page_count(struct ra_io_arg *ria) +{ + __u64 length = ria->ria_end >= ria->ria_start ? + ria->ria_end - ria->ria_start + 1 : 0; + + return stride_pg_count(ria->ria_stoff, ria->ria_length, + ria->ria_pages, ria->ria_start, + length); +} + +/*Check whether the index is in the defined ra-window */ +static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) +{ + /* If ria_length == ria_pages, it means non-stride I/O mode, + * idx should always inside read-ahead window in this case + * For stride I/O mode, just check whether the idx is inside + * the ria_pages. 
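+ * For example, with ria_stoff = 0, ria_length = 16 and ria_pages = 4,
+ * idx = 3 and idx = 19 are inside the window (3 % 16 and 19 % 16 are
+ * both 3, which is < 4), while idx = 20 is not (20 % 16 = 4).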
*/ + return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || + (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % + ria->ria_length < ria->ria_pages); +} + +static int ll_read_ahead_pages(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *queue, + struct ra_io_arg *ria, + unsigned long *reserved_pages, + struct address_space *mapping, + unsigned long *ra_end) +{ + int rc, count = 0, stride_ria; + unsigned long page_idx; + + LASSERT(ria != NULL); + RIA_DEBUG(ria); + + stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; + for (page_idx = ria->ria_start; page_idx <= ria->ria_end && + *reserved_pages > 0; page_idx++) { + if (ras_inside_ra_window(page_idx, ria)) { + /* If the page is inside the read-ahead window*/ + rc = ll_read_ahead_page(env, io, queue, + page_idx, mapping); + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; + } else if (stride_ria) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap */ + pgoff_t offset; + /* FIXME: This assertion only is valid when it is for + * forward read-ahead, it will be fixed when backward + * read-ahead is implemented */ + LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", + page_idx, + ria->ria_start, ria->ria_end, ria->ria_stoff, + ria->ria_length, ria->ria_pages); + offset = page_idx - ria->ria_stoff; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; + CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, + ria->ria_length - offset); + continue; + } + } + } + *ra_end = page_idx; + return count; +} + +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags) +{ + struct vvp_io *vio = vvp_env_io(env); + struct vvp_thread_info *vti = vvp_env_info(env); + struct cl_attr *attr = ccc_env_thread_attr(env); + unsigned long start = 0, end = 0, reserved; + unsigned long ra_end, len; + struct inode *inode; + struct ll_ra_read *bead; + struct ra_io_arg *ria = &vti->vti_ria; + struct ll_inode_info *lli; + struct cl_object *clob; + int ret = 0; + __u64 kms; + + inode = mapping->host; + lli = ll_i2info(inode); + clob = lli->lli_clob; + + memset(ria, 0, sizeof(*ria)); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + return ret; + kms = attr->cat_kms; + if (kms == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN); + return 0; + } + + spin_lock(&ras->ras_lock); + if (vio->cui_ra_window_set) + bead = &vio->cui_bead; + else + bead = NULL; + + /* Enlarge the RA window to encompass the full read */ + if (bead != NULL && ras->ras_window_start + ras->ras_window_len < + bead->lrr_start + bead->lrr_count) { + ras->ras_window_len = bead->lrr_start + bead->lrr_count - + ras->ras_window_start; + } + /* Reserve a part of the read-ahead window that we'll be issuing */ + if (ras->ras_window_len) { + start = ras->ras_next_readahead; + end = ras->ras_window_start + ras->ras_window_len - 1; + } + if (end != 0) { + unsigned long rpc_boundary; + /* + * Align RA window to an optimal boundary. + * + * XXX This would be better to align to cl_max_pages_per_rpc + * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may + * be aligned to the RAID stripe size in the future and that + * is more important than the RPC size. 
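+ * As a worked example with PTLRPC_MAX_BRW_PAGES = 256: for a window
+ * ending at page 300, rpc_boundary below becomes ((300 + 1) & ~255) - 1
+ * = 255, so (when start is below that) the window is trimmed to end
+ * exactly on the 256-page RPC boundary.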
+ */ + /* Note: we only trim the RPC, instead of extending the RPC + * to the boundary, so to avoid reading too much pages during + * random reading. */ + rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)); + if (rpc_boundary > 0) + rpc_boundary--; + + if (rpc_boundary > start) + end = rpc_boundary; + + /* Truncate RA window to end of file */ + end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT)); + + ras->ras_next_readahead = max(end, end + 1); + RAS_CDEBUG(ras); + } + ria->ria_start = start; + ria->ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; + } + spin_unlock(&ras->ras_lock); + + if (end == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW); + return 0; + } + len = ria_page_count(ria); + if (len == 0) + return 0; + + reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len); + if (reserved < len) + ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, + ria, &reserved, mapping, &ra_end); + + LASSERTF(reserved >= 0, "reserved %lu\n", reserved); + if (reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), reserved); + + if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT)) + ll_ra_stats_inc(mapping, RA_STAT_EOF); + + /* if we didn't get to the end of the region we reserved from + * the ras we need to go back and update the ras so that the + * next read-ahead tries from where we left off. we only do so + * if the region we failed to issue read-ahead on is still ahead + * of the app and behind the next index to start read-ahead from */ + CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n", + ra_end, end, ria->ria_end); + + if (ra_end != end + 1) { + spin_lock(&ras->ras_lock); + if (ra_end < ras->ras_next_readahead && + index_in_window(ra_end, ras->ras_window_start, 0, + ras->ras_window_len)) { + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); + } + spin_unlock(&ras->ras_lock); + } + + return ret; +} + +static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1)); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_last_readpage = index; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; + ras->ras_window_len = 0; + ras_set_start(inode, ras, index); + ras->ras_next_readahead = max(ras->ras_window_start, index); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_pages = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras_reset(inode, ras, 0); + ras->ras_requests = 0; + INIT_LIST_HEAD(&ras->ras_read_beads); +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return 1, otherwise return 0. 
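+ * For example, with ras_stride_length = 16 and ras_stride_pages = 4, a
+ * read at ras_last_readpage + 13 (a gap of 12 = 16 - 4 pages) matches
+ * the stride pattern, provided the previous run covered exactly 4
+ * consecutive pages.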
+ */ +static int index_in_stride_window(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) + return 0; + + stride_gap = index - ras->ras_last_readpage - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && + ras->ras_consecutive_pages == ras->ras_stride_pages; +} + +static void ras_update_stride_detector(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap = index - ras->ras_last_readpage - 1; + + if (!stride_io_mode(ras) && (stride_gap != 0 || + ras->ras_consecutive_stride_requests == 0)) { + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + + RAS_CDEBUG(ras); + return; +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, unsigned long len) +{ + return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, + ras->ras_stride_pages, ras->ras_stride_offset, + len); +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, + unsigned long inc_len) +{ + unsigned long left, step, window_len; + unsigned long stride_len; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(ras->ras_window_start + ras->ras_window_len + >= ras->ras_stride_offset, "window_start %lu, window_len %lu stride_offset %lu\n", + ras->ras_window_start, + ras->ras_window_len, ras->ras_stride_offset); + + stride_len = ras->ras_window_start + ras->ras_window_len - + ras->ras_stride_offset; + + left = stride_len % ras->ras_stride_length; + window_len = ras->ras_window_len - left; + + if (left < ras->ras_stride_pages) + left += inc_len; + else + left = ras->ras_stride_pages + inc_len; + + LASSERT(ras->ras_stride_pages != 0); + + step = left / ras->ras_stride_pages; + left %= ras->ras_stride_pages; + + window_len += step * ras->ras_stride_length + left; + + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) + ras->ras_window_len = window_len; + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. FIXME later + */ + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode)); + else + ras->ras_window_len = min(ras->ras_window_len + + RAS_INCREASE_STEP(inode), + ra->ra_max_pages_per_file); +} + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + int zero = 0, stride_detect = 0, ra_miss = 0; + + spin_lock(&ras->ras_lock); + + ll_ra_stats_inc_sbi(sbi, hit ? 
RA_STAT_HIT : RA_STAT_MISS); + + /* reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM is + * reclaiming it before we get to it. */ + if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { + zero = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (!hit && ras->ras_window_len && + index < ras->ras_next_readahead && + index_in_window(index, ras->ras_window_start, 0, + ras->ras_window_len)) { + ra_miss = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + } + + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests == 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_last_readpage = 0; + ras->ras_next_readahead = 0; + ras->ras_window_len = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + goto out_unlock; + } + } + if (zero) { + /* check whether it is in stride I/O mode*/ + if (!index_in_stride_window(ras, index)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests++; + } else { + ras_stride_reset(ras); + } + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + goto out_unlock; + } else { + ras->ras_consecutive_pages = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = 1; + RAS_CDEBUG(ras); + } + } else { + if (ra_miss) { + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + /*If stride-RA hit cache miss, the stride dector + *will not be reset to avoid the overhead of + *redetecting read-ahead mode */ + if (index != ras->ras_last_readpage + 1) + ras->ras_consecutive_pages = 0; + ras_reset(inode, ras, index); + RAS_CDEBUG(ras); + } else { + /* Reset both stride window and normal RA + * window */ + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + ras_stride_reset(ras); + goto out_unlock; + } + } else if (stride_io_mode(ras)) { + /* If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window*/ + if (!index_in_stride_window(ras, index)) { + /* Shrink stride read-ahead window to be zero */ + ras_stride_reset(ras); + ras->ras_window_len = 0; + ras->ras_next_readahead = index; + } + } + } + ras->ras_consecutive_pages++; + ras->ras_last_readpage = index; + ras_set_start(inode, ras, index); + + if (stride_io_mode(ras)) + /* Since stride readahead is sensitive to the offset + * of read-ahead, so we use original offset here, + * instead of ras_window_start, which is RPC aligned */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + else + 
ras->ras_next_readahead = max(ras->ras_window_start, + ras->ras_next_readahead); + RAS_CDEBUG(ras); + + /* Trigger RA in the mmap case where ras_consecutive_requests + * is not incremented and thus can't be used to trigger RA */ + if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) { + ras->ras_window_len = RAS_INCREASE_STEP(inode); + goto out_unlock; + } + + /* Initially reset the stride window offset to next_readahead*/ + if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { + /** + * Once stride IO mode is detected, next_readahead should be + * reset to make sure next_readahead > stride offset + */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + ras->ras_stride_offset = index; + ras->ras_window_len = RAS_INCREASE_STEP(inode); + } + + /* The initial ras_window_len is set to the request size. To avoid + * uselessly reading and discarding pages for random IO the window is + * only increased once per consecutive request received. */ + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) + ras_increase_window(inode, ras, ra); +out_unlock: + RAS_CDEBUG(ras); + ras->ras_request_index++; + spin_unlock(&ras->ras_lock); + return; +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + struct cl_env_nest nest; + bool redirtied = false; + bool unlocked = false; + int result; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) { + result = PTR_ERR(env); + goto out; + } + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = ccc_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_CACHE_SIZE - 1, + CL_FSYNC_LOCAL, 1); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. 
*/ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_nested_put(&nest, env); + goto out; + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ll_sb_info *sbi = ll_i2sbi(inode); + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + int ignore_layout = 0; + + if (wbc->range_cyclic) { + start = mapping->writeback_index << PAGE_CACHE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (sbi->ll_umounting) + /* if the mountpoint is being umounted, all pages have to be + * evicted to avoid hitting LBUG when truncate_inode_pages() + * is called later on. */ + ignore_layout = 1; + result = cl_sync_file_range(inode, start, end, mode, ignore_layout); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + end = i_size_read(inode); + mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1; + } + return result; +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct ll_cl_context *lcc; + int result; + + lcc = ll_cl_init(file, vmpage, 0); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + result = cl_io_read_page(env, io, page); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + ll_cl_fini(lcc); + } else { + unlock_page(vmpage); + result = PTR_ERR(lcc); + } + return result; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/rw26.c b/kernel/drivers/staging/lustre/lustre/llite/rw26.c new file mode 100644 index 000000000..c6c824356 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/rw26.c @@ -0,0 +1,553 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static void ll_invalidatepage(struct page *vmpage, unsigned int offset, + unsigned int length) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + int refcheck; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ + if (offset == 0 && length == PAGE_CACHE_SIZE) { + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + lu_ref_add(&page->cp_reference, + "delete", vmpage); + cl_page_delete(env, page); + lu_ref_del(&page->cp_reference, + "delete", vmpage); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + cl_env_put(env, &refcheck); + } + } +} + +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + return 0; + + mapping = vmpage->mapping; + if (mapping == NULL) + return 1; + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + return 1; + + /* 1 for page allocator, 1 for cl_page and 1 for page cache */ + if (page_count(vmpage) > 3) + return 0; + + /* TODO: determine what gfp should be used by @gfp_mask. */ + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + /* If we can't allocate an env we won't call cl_page_put() + * later on which further means it's impossible to drop + * page refcount by cl_page, so ask kernel to not free + * this page. */ + return 0; + + page = cl_vmpage_page(vmpage, obj); + result = page == NULL; + if (page != NULL) { + if (!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + cl_page_put(env, page); + } + cl_env_nested_put(&nest, env); + return result; +} + +static int ll_set_page_dirty(struct page *vmpage) +{ +#if 0 + struct cl_page *page = vvp_vmpage_page_transient(vmpage); + struct vvp_object *obj = cl_inode2vvp(vmpage->mapping->host); + struct vvp_page *cpg; + + /* + * XXX should page method be called here? 
+ */ + LASSERT(&obj->co_cl == page->cp_obj); + cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + /* + * XXX cannot do much here, because page is possibly not locked: + * sys_munmap()->... + * ->unmap_page_range()->zap_pte_range()->set_page_dirty(). + */ + vvp_write_pending(obj, cpg); +#endif + return __set_page_dirty_nobuffers(vmpage); +} + +#define MAX_DIRECTIO_SIZE (2*1024*1024*1024UL) + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages, + int *max_pages) +{ + int result = -ENOMEM; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + *max_pages -= user_addr >> PAGE_CACHE_SHIFT; + + OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); + if (*pages) { + result = get_user_pages_fast(user_addr, *max_pages, + (rw == READ), *pages); + if (unlikely(result <= 0)) + OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); + } + + return result; +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (do_dirty) + set_page_dirty_lock(pages[i]); + page_cache_release(pages[i]); + } + kvfree(pages); +} + +ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv) +{ + struct cl_page *clp; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + int i; + ssize_t rc = 0; + loff_t file_offset = pv->ldp_start_offset; + long size = pv->ldp_size; + int page_count = pv->ldp_nr; + struct page **pages = pv->ldp_pages; + long page_size = cl_page_size(obj); + bool do_io; + int io_pages = 0; + + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + if (pv->ldp_offsets) + file_offset = pv->ldp_offsets[i]; + + LASSERT(!(file_offset & (page_size - 1))); + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pv->ldp_pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + do_io = true; + + /* check the page type: if the page is a host page, then do + * write directly */ + if (clp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cl_page_vmpage(env, clp); + struct page *src_page; + struct page *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = kmap_atomic(src_page); + dst = kmap_atomic(dst_page); + memcpy(dst, src, min(page_size, size)); + kunmap_atomic(dst); + kunmap_atomic(src); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). */ + if (rw == WRITE) + set_page_dirty(vmpage); + + if (rw == READ) { + /* do not issue the page for read, since it + * may reread a ra page which has NOT uptodate + * bit set. */ + cl_page_disown(env, io, clp); + do_io = false; + } + } + + if (likely(do_io)) { + cl_2queue_add(queue, clp); + + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. 
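ll_get_user_pages() above sizes its page array from the number of pages a user buffer touches, which depends on the buffer's offset within its first page: round the end up to a page boundary, round the start down, and take the difference. A small sketch of that calculation, with the 4 KiB page size as an assumption (PAGE_SHIFT_SKETCH stands in for PAGE_CACHE_SHIFT):

#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12	/* assumed 4 KiB pages */
#define PAGE_SIZE_SKETCH  (1UL << PAGE_SHIFT_SKETCH)

/* Number of pages spanned by `size` bytes starting at user address `addr`. */
static unsigned long pages_spanned(unsigned long addr, unsigned long size)
{
	unsigned long last = (addr + size + PAGE_SIZE_SKETCH - 1) >> PAGE_SHIFT_SKETCH;

	return last - (addr >> PAGE_SHIFT_SKETCH);
}

int main(void)
{
	printf("%lu\n", pages_spanned(0x1000, 4096));	/* 1: page-aligned buffer */
	printf("%lu\n", pages_spanned(0x1800, 4096));	/* 2: misaligned buffer crosses a page */
	return 0;
}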
+ */ + cl_page_clip(env, clp, 0, min(size, page_size)); + + ++io_pages; + } + + /* drop the reference count for cl_page_find */ + cl_page_put(env, clp); + size -= page_size; + file_offset += page_size; + } + + if (rc == 0 && io_pages) { + rc = cl_io_submit_sync(env, io, + rw == READ ? CRT_READ : CRT_WRITE, + queue, 0); + } + if (rc == 0) + rc = pv->ldp_size; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + return rc; +} +EXPORT_SYMBOL(ll_direct_rw_pages); + +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct address_space *mapping, + size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct ll_dio_pages pvec = { .ldp_pages = pages, + .ldp_nr = page_count, + .ldp_size = size, + .ldp_offsets = NULL, + .ldp_start_offset = file_offset + }; + + return ll_direct_rw_pages(env, io, rw, inode, &pvec); +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \ + ~(DT_MAX_BRW_SIZE - 1)) +static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, + loff_t file_offset) +{ + struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ccc_object *obj = cl_inode2ccc(inode); + ssize_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + struct ll_inode_info *lli = ll_i2info(inode); + long size = MAX_DIO_SIZE; + int refcheck; + + if (!lli->lli_has_smd) + return -EBADF; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK)) + return -EINVAL; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n", + inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_CACHE_SHIFT, + MAX_DIO_SIZE >> PAGE_CACHE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK) + return -EINVAL; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + io = ccc_env_io(env)->cui_cl.cis_io; + LASSERT(io != NULL); + + /* 0. Need locking between buffered and direct access. and race with + * size changing by concurrent truncates and writes. + * 1. Need inode mutex to operate transient pages. 
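The MAX_DIO_SIZE comment above quotes concrete limits ("up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc"). Those numbers follow from the macro if one assumes a 24-byte struct brw_page, 4 KiB pages, and a 1 MiB RPC size to round down to; all three values below are assumptions used only to reproduce the arithmetic:

#include <stdio.h>

#define BRW_PAGE_SIZE	24UL		/* assumed sizeof(struct brw_page) */
#define PAGE_SIZE_SK	4096UL		/* assumed page size */
#define MAX_BRW_SIZE	(1UL << 20)	/* assumed RPC size to truncate to */

static unsigned long max_dio_size(unsigned long kmalloc_max)
{
	/* one brw_page descriptor per data page, all fitting in one kmalloc,
	 * then truncated to a whole number of RPCs */
	return (kmalloc_max / BRW_PAGE_SIZE * PAGE_SIZE_SK) & ~(MAX_BRW_SIZE - 1);
}

int main(void)
{
	printf("%lu MiB\n", max_dio_size(128UL << 10) >> 20);	/* 21 MiB, the "22MB" quoted above */
	printf("%lu MiB\n", max_dio_size(4UL << 20) >> 20);	/* 682 MiB */
	return 0;
}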
+ */ + if (iov_iter_rw(iter) == READ) + mutex_lock(&inode->i_mutex); + + LASSERT(obj->cob_transient_pages == 0); + while (iov_iter_count(iter)) { + struct page **pages; + size_t offs; + + count = min_t(size_t, iov_iter_count(iter), size); + if (iov_iter_rw(iter) == READ) { + if (file_offset >= i_size_read(inode)) + break; + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; + } + + result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); + if (likely(result > 0)) { + int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); + result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter), + inode, file->f_mapping, + result, file_offset, pages, + n); + ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ); + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_CACHE_SIZE / sizeof(*pages)) * + PAGE_CACHE_SIZE) { + size = ((((size / 2) - 1) | + ~CFS_PAGE_MASK) + 1) & + CFS_PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %lu\n", + size); + continue; + } + + goto out; + } + iov_iter_advance(iter, result); + tot_bytes += result; + file_offset += result; + } +out: + LASSERT(obj->cob_transient_pages == 0); + if (iov_iter_rw(iter) == READ) + mutex_unlock(&inode->i_mutex); + + if (tot_bytes > 0) { + if (iov_iter_rw(iter) == WRITE) { + struct lov_stripe_md *lsm; + + lsm = ccc_inode_lsm_get(inode); + LASSERT(lsm != NULL); + lov_stripe_lock(lsm); + obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0); + lov_stripe_unlock(lsm); + ccc_inode_lsm_put(inode, lsm); + } + } + + cl_env_put(env, &refcheck); + return tot_bytes ? 
: result; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int rc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + rc = ll_prepare_write(file, page, from, from + len); + if (rc) { + unlock_page(page); + page_cache_release(page); + } + return rc; +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int rc; + + rc = ll_commit_write(file, page, from, from + copied); + unlock_page(page); + page_cache_release(page); + + return rc ?: copied; +} + +#ifdef CONFIG_MIGRATION +static int ll_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode + ) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +#ifndef MS_HAS_NEW_AOPS +const struct address_space_operations ll_aops = { + .readpage = ll_readpage, + .direct_IO = ll_direct_IO_26, + .writepage = ll_writepage, + .writepages = ll_writepages, + .set_page_dirty = ll_set_page_dirty, + .write_begin = ll_write_begin, + .write_end = ll_write_end, + .invalidatepage = ll_invalidatepage, + .releasepage = (void *)ll_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = ll_migratepage, +#endif +}; +#else +const struct address_space_operations_ext ll_aops = { + .orig_aops.readpage = ll_readpage, +/* .orig_aops.readpages = ll_readpages, */ + .orig_aops.direct_IO = ll_direct_IO_26, + .orig_aops.writepage = ll_writepage, + .orig_aops.writepages = ll_writepages, + .orig_aops.set_page_dirty = ll_set_page_dirty, + .orig_aops.prepare_write = ll_prepare_write, + .orig_aops.commit_write = ll_commit_write, + .orig_aops.invalidatepage = ll_invalidatepage, + .orig_aops.releasepage = ll_releasepage, +#ifdef CONFIG_MIGRATION + .orig_aops.migratepage = ll_migratepage, +#endif + .write_begin = ll_write_begin, + .write_end = ll_write_end +}; +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/statahead.c b/kernel/drivers/staging/lustre/lustre/llite/statahead.c new file mode 100644 index 000000000..7f8071242 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/statahead.c @@ -0,0 +1,1729 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ + SA_ENTRY_DEST = 3, /** entry to be destroyed */ +} se_stat_t; + +struct ll_sa_entry { + /* link into sai->sai_entries */ + struct list_head se_link; + /* link into sai->sai_entries_{received,stated} */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry reference count */ + atomic_t se_refcount; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_stat_t se_stat; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; +}; + +static unsigned int sai_generation; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* + * The entry only can be released by the caller, it is necessary to hold lock. + */ +static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) +{ + smp_rmb(); + return (entry->se_stat != SA_ENTRY_INIT); +} + +static inline int ll_sa_entry_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* + * Insert entry to hash SA table. + */ +static inline void +ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* + * Remove entry from SA table. 
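The statahead entries defined above live both on a global list and in a small hash table whose bucket is just the low bits of the name hash (ll_sa_entry_hash() masks with LL_SA_CACHE_MASK). A toy sketch of that bucketing scheme; the bucket count and the hash function are stand-ins, not the kernel's full_name_hash():

#include <stdio.h>
#include <string.h>

#define CACHE_SIZE 32			/* stand-in bucket count; power of two */
#define CACHE_MASK (CACHE_SIZE - 1)

/* A trivial string hash standing in for full_name_hash(). */
static unsigned int name_hash(const char *name, size_t len)
{
	unsigned int h = 0;

	while (len--)
		h = h * 31 + (unsigned char)*name++;
	return h;
}

static int bucket_of(const char *name)
{
	return name_hash(name, strlen(name)) & CACHE_MASK;
}

int main(void)
{
	/* Entries hash into one of CACHE_SIZE buckets; a name lookup only
	 * scans the short list in its own bucket. */
	printf("%d %d\n", bucket_of("fileA"), bucket_of("fileB"));
	return 0;
}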
+ */ +static inline void +ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); +} + +static inline struct ll_sa_entry * +sa_first_received_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_received.next, + struct ll_sa_entry, se_list); +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_agl.next, + struct ll_inode_info, lli_agl_list); +} + +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +static inline int sa_received_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_received); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_agl); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * If the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* + * Insert it into sai_entries tail when init. + */ +static struct ll_sa_entry * +ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index, + const char *name, int len) +{ + struct ll_inode_info *lli; + struct ll_sa_entry *entry; + int entry_size; + char *dname; + + entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; + entry = kzalloc(entry_size, GFP_NOFS); + if (unlikely(!entry)) + return ERR_PTR(-ENOMEM); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + /* + * Statahead entry reference rules: + * + * 1) When statahead entry is initialized, its reference is set as 2. + * One reference is used by the directory scanner. When the scanner + * searches the statahead cache for the given name, it can perform + * lockless hash lookup (only the scanner can remove entry from hash + * list), and once found, it needn't to call "atomic_inc()" for the + * entry reference. So the performance is improved. After using the + * statahead entry, the scanner will call "atomic_dec()" to drop the + * reference held when initialization. If it is the last reference, + * the statahead entry will be freed. + * + * 2) All other threads, including statahead thread and ptlrpcd thread, + * when they process the statahead entry, the reference for target + * should be held to guarantee the entry will not be released by the + * directory scanner. After processing the entry, these threads will + * drop the entry reference. If it is the last reference, the entry + * will be freed. + * + * The second reference when initializes the statahead entry is used + * by the statahead thread, following the rule 2). 
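sa_low_hit() above encodes "hit ratio below 80%" as hit < 4 * miss, since hit / (hit + miss) < 0.8 is algebraically the same as hit < 4 * miss. A tiny sketch of the same predicate with a few worked cases:

#include <stdio.h>

/* Low hit rate: ratio under 80% (once there are enough samples),
 * or more than 8 consecutive misses. */
static int low_hit(unsigned long long hit, unsigned long long miss,
		   unsigned long long consecutive_miss)
{
	return (hit > 7 && hit < 4 * miss) || consecutive_miss > 8;
}

int main(void)
{
	printf("%d\n", low_hit(79, 21, 0));	/* 1: 79% hit rate is below 80% */
	printf("%d\n", low_hit(80, 20, 0));	/* 0: exactly 80% is not "low" */
	printf("%d\n", low_hit(100, 1, 9));	/* 1: too many consecutive misses */
	return 0;
}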
+ */ + atomic_set(&entry->se_refcount, 2); + entry->se_stat = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct ll_sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = full_name_hash(name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + + lli = ll_i2info(sai->sai_inode); + spin_lock(&lli->lli_sa_lock); + list_add_tail(&entry->se_link, &sai->sai_entries); + INIT_LIST_HEAD(&entry->se_list); + ll_sa_entry_enhash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + return entry; +} + +/* + * Used by the directory scanner to search entry with name. + * + * Only the caller can remove the entry from hash, so it is unnecessary to hold + * hash lock. It is caller's duty to release the init refcount on the entry, so + * it is also unnecessary to increase refcount on the entry. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct ll_sa_entry *entry; + int i = ll_sa_entry_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* + * Used by the async getattr request callback to find entry with index. + * + * Inside lli_sa_lock to prevent others to change the list during the search. + * It needs to increase entry refcount before returning to guarantee that the + * entry cannot be freed by others. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) +{ + struct ll_sa_entry *entry; + + list_for_each_entry(entry, &sai->sai_entries, se_link) { + if (entry->se_index == index) { + LASSERT(atomic_read(&entry->se_refcount) > 0); + atomic_inc(&entry->se_refcount); + return entry; + } + if (entry->se_index > index) + break; + } + return NULL; +} + +static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } +} + +static void ll_sa_entry_put(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + if (atomic_dec_and_test(&entry->se_refcount)) { + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_link)); + LASSERT(list_empty(&entry->se_list)); + LASSERT(ll_sa_entry_unhashed(entry)); + + ll_sa_entry_cleanup(sai, entry); + iput(entry->se_inode); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); + } +} + +static inline void +do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + LASSERT(!ll_sa_entry_unhashed(entry)); + LASSERT(!list_empty(&entry->se_link)); + + ll_sa_entry_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + entry->se_stat = SA_ENTRY_DEST; + list_del_init(&entry->se_link); + if (likely(!list_empty(&entry->se_list))) + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); +} + +/* + * Delete it from sai_entries_stated list when fini. 
+ */ +static void +ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_sa_entry *pos, *next; + + if (entry) + do_sa_entry_fini(sai, entry); + + /* drop old entry, only 'scanner' process does this, no need to lock */ + list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { + if (!is_omitted_entry(sai, pos->se_index)) + break; + do_sa_entry_fini(sai, pos); + } +} + +/* + * Inside lli_sa_lock. + */ +static void +do_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_sa_entry *se; + struct list_head *pos = &sai->sai_entries_stated; + + if (!list_empty(&entry->se_list)) + list_del_init(&entry->se_list); + + list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + + list_add(&entry->se_list, pos); + entry->se_stat = stat; +} + +/* + * Move entry to sai_entries_stated and sort with the index. + * \retval 1 -- entry to be destroyed. + * \retval 0 -- entry is inserted into stated list. + */ +static int +ll_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + int ret = 1; + + ll_sa_entry_cleanup(sai, entry); + + spin_lock(&lli->lli_sa_lock); + if (likely(entry->se_stat != SA_ENTRY_DEST)) { + do_sa_entry_to_stated(sai, entry, stat); + ret = 0; + } + spin_unlock(&lli->lli_sa_lock); + + return ret; +} + +/* + * Insert inode into the list of sai_entries_agl. + */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_inode); + int added = 0; + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + igrab(inode); + spin_lock(&parent->lli_agl_lock); + if (agl_list_empty(sai)) + added = 1; + list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } + + if (added > 0) + wake_up(&sai->sai_agl_thread.t_ctl_waitq); +} + +static struct ll_statahead_info *ll_sai_alloc(void) +{ + struct ll_statahead_info *sai; + int i; + + sai = kzalloc(sizeof(*sai), GFP_NOFS); + if (!sai) + return NULL; + + atomic_set(&sai->sai_refcount, 1); + + spin_lock(&sai_generation_lock); + sai->sai_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + sai->sai_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); + init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_entries_received); + INIT_LIST_HEAD(&sai->sai_entries_stated); + INIT_LIST_HEAD(&sai->sai_entries_agl); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + return sai; +} + +static inline struct ll_statahead_info * +ll_sai_get(struct ll_statahead_info *sai) +{ + atomic_inc(&sai->sai_refcount); + return sai; +} + +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct inode *inode = sai->sai_inode; + struct ll_inode_info *lli = ll_i2info(inode); + 
+ if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct ll_sa_entry *entry, *next; + + if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { + /* It is race case, the interpret callback just hold + * a reference count */ + spin_unlock(&lli->lli_sa_lock); + return; + } + + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + + lli->lli_sai = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai->sai_sent > sai->sai_replied) + CDEBUG(D_READA, "statahead for dir "DFID + " does not finish: [sent:%llu] [replied:%llu]\n", + PFID(&lli->lli_fid), + sai->sai_sent, sai->sai_replied); + + list_for_each_entry_safe(entry, next, + &sai->sai_entries, se_link) + do_sa_entry_fini(sai, entry); + + LASSERT(list_empty(&sai->sai_entries)); + LASSERT(sa_received_empty(sai)); + LASSERT(list_empty(&sai->sai_entries_stated)); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + iput(inode); + OBD_FREE_PTR(sai); + } +} + +/* Do NOT forget to drop inode refcount when into sai_entries_agl. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + __u64 index = lli->lli_agl_index; + int rc; + + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL maybe fall behind statahead with one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + return; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + return; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by other client, AGL maybe cannot obtain new lock + * for no glimpse callback triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * Under such case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed, compared with other two cases, it is + * relative rare. AGL can ignore such case, and it will not muchly + * affect the performance. 
+ */ + if (lli->lli_glimpse_time != 0 && + time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + return; + } + + CDEBUG(D_READA, "Handling (init) async glimpse: inode = " + DFID", idx = %llu\n", PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = cfs_time_current(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, "Handled (init) async glimpse: inode= " + DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); +} + +static void ll_post_statahead(struct ll_statahead_info *sai) +{ + struct inode *dir = sai->sai_inode; + struct inode *child; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sa_entry *entry; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(sa_received_empty(sai))) { + spin_unlock(&lli->lli_sa_lock); + return; + } + entry = sa_first_received_entry(sai); + atomic_inc(&entry->se_refcount); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + child = entry->se_inode; + if (child == NULL) { + /* + * lookup. + */ + LASSERT(fid_is_zero(&minfo->mi_data.op_fid2)); + + /* XXX: No fid in reply, this is probably cross-ref case. + * SA can't handle it yet. */ + if (body->valid & OBD_MD_MDS) { + rc = -EAGAIN; + goto out; + } + } else { + /* + * revalidate. + */ + /* unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){ + entry->se_inode = NULL; + iput(child); + child = NULL; + } + } + + it->d.lustre.it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) { + rc = -EAGAIN; + goto out; + } + + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + goto out; + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + child, child->i_ino, child->i_generation); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + +out: + /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock + * reference count by calling "ll_intent_drop_lock()" in spite of the + * above operations failed or not. Do not worry about calling + * "ll_intent_drop_lock()" more than once. */ + rc = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + ll_sa_entry_put(sai, entry); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct ll_sa_entry *entry; + __u64 handle = 0; + int wakeup; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + if (rc == 0) { + /* release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. 
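ll_agl_trigger() above skips the async glimpse when another glimpse already ran within the last second, recording the time of the last one in lli_glimpse_time. A userspace sketch of that rate-limit pattern, using wall-clock seconds rather than the kernel's jiffies helpers (cfs_time_shift()/time_before()):

#include <stdio.h>
#include <time.h>

static time_t last_glimpse;	/* 0 means "never glimpsed" */

/* Return 1 and record the time if a glimpse should run now,
 * 0 if one already ran within the last second. */
static int glimpse_allowed(time_t now)
{
	if (last_glimpse != 0 && now - last_glimpse < 1)
		return 0;
	last_glimpse = now;
	return 1;
}

int main(void)
{
	time_t now = time(NULL);

	printf("%d\n", glimpse_allowed(now));		/* 1: first glimpse runs */
	printf("%d\n", glimpse_allowed(now));		/* 0: suppressed, within 1 second */
	printf("%d\n", glimpse_allowed(now + 2));	/* 1: old enough to glimpse again */
	return 0;
}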
*/ + handle = it->d.lustre.it_lock_handle; + ll_intent_drop_lock(it); + } + + spin_lock(&lli->lli_sa_lock); + /* stale entry */ + if (unlikely(lli->lli_sai == NULL || + lli->lli_sai->sai_generation != minfo->mi_generation)) { + spin_unlock(&lli->lli_sa_lock); + rc = -ESTALE; + goto out; + } else { + sai = ll_sai_get(lli->lli_sai); + if (unlikely(!thread_is_running(&sai->sai_thread))) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + rc = -EBADFD; + goto out; + } + + entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); + if (entry == NULL) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + rc = -EIDRM; + goto out; + } + + if (rc != 0) { + do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); + wakeup = (entry->se_index == sai->sai_index_wait); + } else { + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. */ + entry->se_handle = handle; + wakeup = sa_received_empty(sai); + list_add_tail(&entry->se_list, + &sai->sai_entries_received); + } + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); + if (wakeup) + wake_up(&sai->sai_thread.t_ctl_waitq); + } + +out: + if (rc != 0) { + ll_intent_release(it); + iput(dir); + OBD_FREE_PTR(minfo); + } + if (sai != NULL) + ll_sai_put(sai); + return rc; +} + +static void sa_args_fini(struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + LASSERT(minfo && einfo); + iput(minfo->mi_dir); + capa_put(minfo->mi_data.op_capa1); + capa_put(minfo->mi_data.op_capa2); + OBD_FREE_PTR(minfo); + OBD_FREE_PTR(einfo); +} + +/** + * There is race condition between "capa_put" and "ll_statahead_interpret" for + * accessing "op_data.op_capa[1,2]" as following: + * "capa_put" releases "op_data.op_capa[1,2]"'s reference count after calling + * "md_intent_getattr_async". But "ll_statahead_interpret" maybe run first, and + * fill "op_data.op_capa[1,2]" as POISON, then cause "capa_put" access invalid + * "ocapa". So here reserve "op_data.op_capa[1,2]" in "pcapa" before calling + * "md_intent_getattr_async". 
+ */ +static int sa_args_init(struct inode *dir, struct inode *child, + struct ll_sa_entry *entry, struct md_enqueue_info **pmi, + struct ldlm_enqueue_info **pei, + struct obd_capa **pcapa) +{ + struct qstr *qstr = &entry->se_qstr; + struct ll_inode_info *lli = ll_i2info(dir); + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + einfo = kzalloc(sizeof(*einfo), GFP_NOFS); + if (!einfo) + return -ENOMEM; + + minfo = kzalloc(sizeof(*minfo), GFP_NOFS); + if (!minfo) { + OBD_FREE_PTR(einfo); + return -ENOMEM; + } + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, + qstr->len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(einfo); + OBD_FREE_PTR(minfo); + return PTR_ERR(op_data); + } + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_generation = lli->lli_sai->sai_generation; + minfo->mi_cbdata = entry->se_index; + + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + *pmi = minfo; + *pei = einfo; + pcapa[0] = op_data->op_capa1; + pcapa[1] = op_data->op_capa2; + + return 0; +} + +static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + + rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas); + if (rc) + return rc; + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + sa_args_fini(minfo, einfo); + } + + return rc; +} + +/** + * similar to ll_revalidate_it(). 
+ * \retval 1 -- dentry valid + * \retval 0 -- will send stat-ahead request + * \retval others -- prepare stat-ahead request failed + */ +static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + + if (unlikely(inode == NULL)) + return 1; + + if (d_mountpoint(dentry)) + return 1; + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.d.lustre.it_lock_handle; + ll_intent_release(&it); + return 1; + } + + rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas); + if (rc) { + entry->se_inode = NULL; + iput(inode); + return rc; + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + entry->se_inode = NULL; + iput(inode); + sa_args_fini(minfo, einfo); + } + + return rc; +} + +static void ll_statahead_one(struct dentry *parent, const char *entry_name, + int entry_name_len) +{ + struct inode *dir = d_inode(parent); + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct ll_sa_entry *entry; + int rc; + int rc1; + + entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name, + entry_name_len); + if (IS_ERR(entry)) + return; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = do_sa_lookup(dir, entry); + } else { + rc = do_sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, d_inode(dentry))) + ll_agl_add(sai, d_inode(dentry), entry->se_index); + } + + if (dentry != NULL) + dput(dentry); + + if (rc) { + rc1 = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc1 == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + } else { + sai->sai_sent++; + } + + sai->sai_index++; + /* drop one refcount on entry by ll_sa_entry_alloc */ + ll_sa_entry_put(sai, entry); +} + +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = d_inode(parent); + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + + thread->t_pid = current_pid(); + CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", + sai, parent); + + atomic_inc(&sbi->ll_agl_total); + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 1; + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + + while (1) { + l_wait_event(thread->t_ctl_waitq, + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + + if (!thread_is_running(thread)) + break; + + spin_lock(&plli->lli_agl_lock); + /* The statahead thread maybe help to process AGL entries, + * so check whether list empty again. 
*/ + if (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + } else { + spin_unlock(&plli->lli_agl_lock); + } + } + + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 0; + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", + sai, parent); + return 0; +} + +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *plli; + struct task_struct *task; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", + sai, parent); + + plli = ll_i2info(d_inode(parent)); + task = kthread_run(ll_agl_thread, parent, + "ll_agl_%u", plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + thread_set_flags(thread, SVC_STOPPED); + return; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); +} + +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = d_inode(parent); + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; + struct page *page; + __u64 pos = 0; + int first = 0; + int rc = 0; + struct ll_dir_chain chain; + struct l_wait_info lwi = { 0 }; + + thread->t_pid = current_pid(); + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", + sai, parent); + + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) + ll_start_agl(parent, sai); + + atomic_inc(&sbi->ll_sa_total); + spin_lock(&plli->lli_sa_lock); + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + rc, plli->lli_opendir_pid); + goto out; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." 
+ */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + +keep_it: + l_wait_event(thread->t_ctl_waitq, + !sa_sent_full(sai) || + !sa_received_empty(sai) || + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + +interpret_it: + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) { + ll_release_page(page, 0); + rc = 0; + goto out; + } + + /* If no window for metadata statahead, but there are + * some AGL entries to be triggered, then try to help + * to process the AGL entries. */ + if (sa_sent_full(sai)) { + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + + if (!sa_received_empty(sai)) + goto interpret_it; + + if (unlikely( + !thread_is_running(thread))) { + ll_release_page(page, 0); + rc = 0; + goto out; + } + + if (!sa_sent_full(sai)) + goto do_it; + + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + goto keep_it; + } + +do_it: + ll_statahead_one(parent, name, namelen); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(page, 0); + while (1) { + l_wait_event(thread->t_ctl_waitq, + !sa_received_empty(sai) || + sai->sai_sent == sai->sai_replied|| + !thread_is_running(thread), + &lwi); + + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) { + rc = 0; + goto out; + } + + if (sai->sai_sent == sai->sai_replied && + sa_received_empty(sai)) + break; + } + + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai) && + thread_is_running(thread)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + rc = 0; + goto out; + } else if (1) { + /* + * chain is exhausted. + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, pos, &chain); + } else { + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + /* + * go into overflow page. + */ + } + } + +out: + if (sai->sai_agl_valid) { + spin_lock(&plli->lli_agl_lock); + thread_set_flags(agl_thread, SVC_STOPPING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&agl_thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_thread->t_pid); + l_wait_event(agl_thread->t_ctl_waitq, + thread_is_stopped(agl_thread), + &lwi); + } else { + /* Set agl_thread flags anyway. */ + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + } + ll_dir_chain_fini(&chain); + spin_lock(&plli->lli_sa_lock); + if (!sa_received_empty(sai)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&plli->lli_sa_lock); + + /* To release the resources held by received entries. 
*/ + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + spin_lock(&plli->lli_sa_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_sa_lock); + wake_up(&sai->sai_waitq); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + dput(parent); + CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", + sai, parent); + return rc; +} + +/** + * called in ll_file_release(). + */ +void ll_stop_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + if (unlikely(key == NULL)) + return; + + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { + spin_unlock(&lli->lli_sa_lock); + return; + } + + lli->lli_opendir_key = NULL; + + if (lli->lli_sai) { + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; + + if (!thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n", + lli->lli_sai, (unsigned int)thread->t_pid); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + } else { + spin_unlock(&lli->lli_sa_lock); + } + + /* + * Put the ref which was held when first statahead_enter. + * It maybe not the last ref for some statahead requests + * maybe inflight. + */ + ll_sai_put(lli->lli_sai); + } else { + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + } +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NONE_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." + */ + LS_FIRST_DOT_DE +}; + +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct ll_dir_chain chain; + struct qstr *target = &dentry->d_name; + struct page *page; + __u64 pos = 0; + int dot_de; + int rc = LS_NONE_FIRST_DE; + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, + rc, lli->lli_opendir_pid); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NONE_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(page, 0); + goto out; + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. 
+ */ + ll_release_page(page, 0); + break; + } else if (1) { + /* + * chain is exhausted + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, pos, &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + } + } + +out: + ll_dir_chain_fini(&chain); + return rc; +} + +static void +ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); + int hit; + + if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) + hit = 1; + else + hit = 0; + + ll_sa_entry_fini(sai, entry); + if (hit) { + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + sai->sai_miss++; + sai->sai_consecutive_miss++; + if (sa_low_hit(sai) && thread_is_running(thread)) { + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied); + spin_lock(&lli->lli_sa_lock); + if (!thread_is_stopped(thread)) + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + } + } + + if (!thread_is_stopped(thread)) + wake_up(&thread->t_ctl_waitq); +} + +/** + * Start statahead thread if this is the first dir entry. + * Otherwise if a thread is started already, wait it until it is ahead of me. + * \retval 1 -- find entry with lock in cache, the caller needs to do + * nothing. + * \retval 0 -- find entry in cache, but without lock, the caller needs + * refresh from MDS. + * \retval others -- the caller need to process as non-statahead. + */ +int do_statahead_enter(struct inode *dir, struct dentry **dentryp, + int only_unplug) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *parent; + struct ll_sa_entry *entry; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + int rc = 0; + struct ll_inode_info *plli; + + LASSERT(lli->lli_opendir_pid == current_pid()); + + if (sai) { + thread = &sai->sai_thread; + if (unlikely(thread_is_stopped(thread) && + list_empty(&sai->sai_entries_stated))) { + /* to release resource */ + ll_stop_statahead(dir, lli->lli_opendir_key); + return -EAGAIN; + } + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. + */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + return -EAGAIN; + } + } + + entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); + if (entry == NULL || only_unplug) { + ll_sai_unplug(sai, entry); + return entry ? 
1 : -EAGAIN; + } + + if (!ll_sa_entry_stated(entry)) { + sai->sai_index_wait = entry->se_index; + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, + ll_sa_entry_stated(entry) || + thread_is_stopped(thread), + &lwi); + if (rc < 0) { + ll_sai_unplug(sai, entry); + return -EAGAIN; + } + } + + if (entry->se_stat == SA_ENTRY_SUCC && + entry->se_inode != NULL) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if (d_inode(*dentryp) == NULL) { + struct dentry *alias; + + alias = ll_splice_alias(inode, + *dentryp); + if (IS_ERR(alias)) { + ll_sai_unplug(sai, entry); + return PTR_ERR(alias); + } + *dentryp = alias; + } else if (d_inode(*dentryp) != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "stale dentry %pd inode %lu/%u, statahead inode %lu/%u\n", + *dentryp, + d_inode(*dentryp)->i_ino, + d_inode(*dentryp)->i_generation, + inode->i_ino, + inode->i_generation); + ll_sai_unplug(sai, entry); + return -ESTALE; + } else { + iput(inode); + } + entry->se_inode = NULL; + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) + d_lustre_revalidate(*dentryp); + ll_intent_release(&it); + } + } + + ll_sai_unplug(sai, entry); + return rc; + } + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + rc = is_first_dirent(dir, *dentryp); + if (rc == LS_NONE_FIRST_DE) { + /* It is not "ls -{a}l" operation, no need statahead for it. */ + rc = -EAGAIN; + goto out; + } + + sai = ll_sai_alloc(); + if (sai == NULL) { + rc = -ENOMEM; + goto out; + } + + sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); + sai->sai_inode = igrab(dir); + if (unlikely(sai->sai_inode == NULL)) { + CWARN("Do not start stat ahead on dying inode "DFID"\n", + PFID(&lli->lli_fid)); + rc = -ESTALE; + goto out; + } + + /* get parent reference count here, and put it in ll_statahead_thread */ + parent = dget((*dentryp)->d_parent); + if (unlikely(sai->sai_inode != d_inode(parent))) { + struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); + + CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", + *dentryp, + PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); + dput(parent); + iput(sai->sai_inode); + rc = -EAGAIN; + goto out; + } + + CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n", + sai, parent); + + /* The sai buffer already has one reference taken at allocation time, + * but as soon as we expose the sai by attaching it to the lli that + * default reference can be dropped by another thread calling + * ll_stop_statahead. We need to take a local reference to protect + * the sai buffer while we intend to access it. */ + ll_sai_get(sai); + lli->lli_sai = sai; + + plli = ll_i2info(d_inode(parent)); + rc = PTR_ERR(kthread_run(ll_statahead_thread, parent, + "ll_sa_%u", plli->lli_opendir_pid)); + thread = &sai->sai_thread; + if (IS_ERR_VALUE(rc)) { + CERROR("can't start ll_sa thread, rc: %d\n", rc); + dput(parent); + lli->lli_opendir_key = NULL; + thread_set_flags(thread, SVC_STOPPED); + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + /* Drop both our own local reference and the default + * reference from allocation time. 
*/ + ll_sai_put(sai); + ll_sai_put(sai); + LASSERT(lli->lli_sai == NULL); + return -EAGAIN; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + ll_sai_put(sai); + + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + return -EAGAIN; + +out: + if (sai != NULL) + OBD_FREE_PTR(sai); + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/super25.c b/kernel/drivers/staging/lustre/lustre/llite/super25.c new file mode 100644 index 000000000..a494f6271 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/super25.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include +#include +#include "../include/lprocfs_status.h" +#include "llite_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (lli == NULL) + return NULL; + + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} + +/* exported operations */ +struct super_operations lustre_super_operations = { + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .evict_inode = ll_delete_inode, + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; +MODULE_ALIAS_FS("lustre"); + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + +static int __init init_lustre_lite(void) +{ + struct proc_dir_entry *entry; + lnet_process_id_t lnet_id; + struct timeval tv; + int i, rc, seed[2]; + + CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + rc = -ENOMEM; + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ll_inode_cachep == NULL) + goto out_cache; + + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) + goto out_cache; + + ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache", + sizeof(struct ll_remote_perm), + 0, 0, NULL); + if (ll_remote_perm_cachep == NULL) + goto out_cache; + + ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache", + REMOTE_PERM_HASHSIZE * + sizeof(struct list_head), + 0, 0, NULL); + if (ll_rmtperm_hash_cachep == NULL) + goto out_cache; + + entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", + rc); + goto out_cache; + } + + proc_lustre_fs_root = entry; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy. 
The NID for this + * node gives the most entropy in the low bits */ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) + seed[0] ^= LNET_NIDADDR(lnet_id.nid); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + setup_timer(&ll_capa_timer, ll_capa_timer_callback, 0); + rc = ll_capa_thread_start(); + if (rc != 0) + goto out_proc; + + rc = vvp_global_init(); + if (rc != 0) + goto out_capa; + + rc = ll_xattr_init(); + if (rc != 0) + goto out_vvp; + + lustre_register_client_fill_super(ll_fill_super); + lustre_register_kill_super_cb(ll_kill_super); + lustre_register_client_process_config(ll_process_config); + + return 0; + +out_vvp: + vvp_global_fini(); +out_capa: + del_timer(&ll_capa_timer); + ll_capa_thread_stop(); +out_proc: + lprocfs_remove(&proc_lustre_fs_root); +out_cache: + if (ll_inode_cachep != NULL) + kmem_cache_destroy(ll_inode_cachep); + + if (ll_file_data_slab != NULL) + kmem_cache_destroy(ll_file_data_slab); + + if (ll_remote_perm_cachep != NULL) + kmem_cache_destroy(ll_remote_perm_cachep); + + if (ll_rmtperm_hash_cachep != NULL) + kmem_cache_destroy(ll_rmtperm_hash_cachep); + + return rc; +} + +static void __exit exit_lustre_lite(void) +{ + lustre_register_client_fill_super(NULL); + lustre_register_kill_super_cb(NULL); + lustre_register_client_process_config(NULL); + + lprocfs_remove(&proc_lustre_fs_root); + + ll_xattr_fini(); + vvp_global_fini(); + del_timer(&ll_capa_timer); + ll_capa_thread_stop(); + LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0, + "client remaining capa count %d\n", + capa_count[CAPA_SITE_CLIENT]); + + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_rmtperm_hash_cachep); + + kmem_cache_destroy(ll_remote_perm_cachep); + + kmem_cache_destroy(ll_file_data_slab); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Lite Client File System"); +MODULE_LICENSE("GPL"); + +module_init(init_lustre_lite); +module_exit(exit_lustre_lite); diff --git a/kernel/drivers/staging/lustre/lustre/llite/symlink.c b/kernel/drivers/staging/lustre/lustre/llite/symlink.c new file mode 100644 index 000000000..3711e671a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/symlink.c @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + return 0; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("inode %lu: rc = %d\n", inode->i_ino, rc); + goto failed; + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + rc = -EPROTO; + goto failed; + } + + LASSERT(symlen != 0); + if (body->eadatasize != symlen) { + CERROR("inode %lu: symlink length %d not expected %d\n", + inode->i_ino, body->eadatasize - 1, symlen - 1); + rc = -EPROTO; + goto failed; + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (*symname == NULL || + strnlen(*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR("inode %lu: symlink not NULL terminated string of length %d\n", + inode->i_ino, symlen - 1); + rc = -EPROTO; + goto failed; + } + + lli->lli_symlink_name = kzalloc(symlen, GFP_NOFS); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + return 0; + +failed: + return rc; +} + +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. 
+ */ + return request; +} + +static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + ptlrpc_req_finished(cookie); +} + +struct inode_operations ll_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .setattr = ll_setattr, + .follow_link = ll_follow_link, + .put_link = ll_put_link, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c new file mode 100644 index 000000000..fde41d7c5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c @@ -0,0 +1,547 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. 
+ */ + +static struct kmem_cache *vvp_thread_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof(struct vvp_session) + }, + { + .ckd_cache = NULL + } +}; + +static void *vvp_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void vvp_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, vvp_thread_kmem); +} + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + + +struct lu_context_key vvp_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_key_init, + .lct_fini = vvp_key_fini +}; + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static const struct cl_device_operations vvp_cl_ops = { + .cdo_req_init = ccc_req_init +}; + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = ccc_device_free, + .ldto_device_init = ccc_device_init, + .ldto_device_fini = ccc_device_fini +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int result; + + result = lu_kmem_init(vvp_caches); + if (result == 0) { + result = ccc_global_init(&vvp_device_type); + if (result != 0) + lu_kmem_fini(vvp_caches); + } + return result; +} + +void vvp_global_fini(void) +{ + ccc_global_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + int refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + cl2ccc_dev(cl)->cdv_sb = sb; + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + return rc; +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + int refcheck; + int result; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + /* + * If mount failed (sbi->ll_cl == NULL), and this there are no other + * mounts, stop device types manually (this usually happens + * automatically when last device is destroyed). + */ + lu_types_stop(); + return result; +} + +/**************************************************************************** + * + * /proc/fs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +/* + * To represent contents of a page cache as a byte stream, following + * information if encoded in 64bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * First two data identify a file in the cache uniquely. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; + + unsigned vpi_curdep; + struct lu_object_header *vpi_obj; +}; + +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = (unsigned long long)pos >> PGC_OBJ_SHIFT; +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} + +static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct vvp_pgcache_id *id = data; + struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + + if (id->vpi_curdep-- > 0) + return 0; /* continue */ + + if (lu_object_is_dying(hdr)) + return 1; + + cfs_hash_get(hs, hnode); + id->vpi_obj = hdr; + return 1; +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + LASSERT(lu_device_is_cl(dev)); + + id->vpi_depth &= 0xf; + id->vpi_obj = NULL; + id->vpi_curdep = id->vpi_depth; + + cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, + vvp_pgcache_obj_get, id); + if (id->vpi_obj != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", current); + return lu2cl(lu_obj); + } + lu_object_put(env, lu_object_top(id->vpi_obj)); + + } else if (id->vpi_curdep > 0) { + id->vpi_depth = 0xf; + } + return NULL; +} + +static loff_t 
vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) +{ + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; + + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); + + while (1) { + if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct cl_object_header *hdr; + int nr; + struct cl_page *pg; + + /* got an object. Find next page. */ + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + nr = radix_tree_gang_lookup(&hdr->coh_tree, + (void **)&pg, + id.vpi_index, 1); + if (nr > 0) { + id.vpi_index = pg->cp_index; + /* Cant support over 16T file */ + nr = !(pg->cp_index > 0xffffffff); + } + spin_unlock(&hdr->coh_page_guard); + + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); + } + /* to the next object. */ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while (0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct ccc_page *cpg; + struct page *vmpage; + int has_flags; + + cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vmpage = cpg->cpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [", + 0 /* gen */, + cpg, page, + "none", + cpg->cpg_write_queued ? "wq" : "- ", + cpg->cpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, vmpage->mapping->host->i_ino, + vmpage->mapping->host->i_generation, + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? 
"" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct cl_page *page; + struct cl_object_header *hdr; + struct vvp_pgcache_id id; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + page = cl_page_lookup(hdr, id.vpi_index); + spin_unlock(&hdr->coh_page_guard); + + seq_printf(f, "%8x@"DFID": ", + id.vpi_index, PFID(&hdr->coh_lu.loh_fid)); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + sbi = f->private; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct ll_sb_info *sbi = PDE_DATA(inode); + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; + } + return result; +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h b/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h new file mode 100644 index 000000000..2162bf6c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + + +#include "../include/cl_object.h" +#include "llite_internal.h" + +int vvp_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int vvp_lock_init (const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int vvp_page_init (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct ccc_object *cl_inode2ccc(struct inode *inode); + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c new file mode 100644 index 000000000..91bba7967 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -0,0 +1,1209 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice); + +/** + * True, if \a io is a normal io, False for splice_{read,write} + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + return vio->cui_io_subtype == IO_NORMAL; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_io *cio = ccc_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != cio->cui_layout_gen) { + io->ci_need_restart = 1; + /* this will return application a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +/***************************************************************************** + * + * io operations. + * + */ + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = ccc_object_inode(ios->cis_obj); + + LASSERT(inode == + file_inode(cl2ccc_io(env, ios)->cui_fd->fd_file)); + vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime); + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct ccc_io *cio = cl2ccc_io(env, ios); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID + " ignore/verify layout %d/%d, layout version %d restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + cio->cui_layout_gen, io->ci_restore_needed); + + if (io->ci_restore_needed == 1) { + int rc; + + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(ccc_object_inode(obj)); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock hold by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh() + */ + if (rc == 0) { + io->ci_restore_needed = 0; + io->ci_need_restart = 1; + io->ci_verify_layout = 1; + } else { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + } + } + + if (!io->ci_ignore_layout && io->ci_verify_layout) { + __u32 gen = 0; + + /* check layout version */ + ll_layout_refresh(ccc_object_inode(obj), &gen); + io->ci_need_restart = cio->cui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + 
cio->cui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_i2info(ccc_object_inode(obj))->lli_flags &= + ~LLIF_FILE_RESTORING; + } + } +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct ccc_io *vio, struct cl_io *io) +{ + struct ccc_thread_info *cti = ccc_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &cti->cti_descr; + ldlm_policy_data_t policy; + unsigned long addr; + ssize_t count; + int result; + struct iov_iter i; + struct iovec iov; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!cl_is_normalio(env, io)) + return 0; + + if (vio->cui_iter == NULL) /* nfs or loop back device write */ + return 0; + + /* No MM (e.g. NFS)? No vmas too. */ + if (mm == NULL) + return 0; + + iov_for_each(iov, i, *(vio->cui_iter)) { + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; + if (count == 0) + continue; + + count += addr & (~CFS_PAGE_MASK); + addr &= CFS_PAGE_MASK; + + down_read(&mm->mmap_sem); + while ((vma = our_vma(mm, addr, count)) != NULL) { + struct inode *inode = file_inode(vma->vm_file); + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case, a lockless lock will be + * generated. + */ + flags = CEF_NEVER; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. 
+ */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) { + up_read(&mm->mmap_sem); + return result; + } + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + up_read(&mm->mmap_sem); + } + return 0; +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + ccc_io_update_iov(env, cio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + result = vvp_mmap_locks(env, cio, io); + if (result == 0) + result = ccc_io_one_lock(env, io, ast_flags, mode, start, end); + return result; +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; + + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + + return result; +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return ccc_io_one_lock_index + (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + return vvp_io_rw_lock(env, io, CLM_WRITE, start, end); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. + */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_io *io = ios->cis_io; + __u64 new_size; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + if ((io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) || + (io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + new_size = 0; + } + cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK; + return ccc_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + /* + * Only ll_inode_size_lock is taken at this level. 
+ */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + truncate_setsize(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_trunc(const struct lu_env *env, + const struct cl_io_slice *ios, + struct inode *inode, loff_t size) +{ + inode_dio_wait(inode); + return 0; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = ccc_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + int result = 0; + + mutex_lock(&inode->i_mutex); + if (cl_io_is_trunc(io)) + result = vvp_io_setattr_trunc(env, ios, inode, + io->u.ci_setattr.sa_attr.lvb_size); + if (result == 0) + result = vvp_io_setattr_time(env, ios); + return result; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + + if (cl_io_is_trunc(io)) { + /* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. 
*/ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + inode_dio_write_done(inode); + } + mutex_unlock(&inode->i_mutex); +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + vvp_io_fini(env, ios); +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct ll_ra_read *bead = &vio->cui_bead; + struct file *file = cio->cui_fd->fd_file; + + int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = cio->cui_tot_count; + int exceed = 0; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); + + if (!can_populate_pages(env, io, inode)) + return 0; + + result = ccc_prep_size(env, obj, io, pos, tot, &exceed); + if (result != 0) + return result; + else if (exceed != 0) + goto out; + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %lu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + cio->cui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->cui_ra_window_set) { + vio->cui_ra_window_set = 1; + bead->lrr_start = cl_index(obj, pos); + /* + * XXX: explicit PAGE_CACHE_SIZE + */ + bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1); + ll_ra_read_in(file, bead); + } + + /* BUG: 5972 */ + file_accessed(file); + switch (vio->cui_io_subtype) { + case IO_NORMAL: + LASSERT(cio->cui_iocb->ki_pos == pos); + result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter); + break; + case IO_SPLICE: + result = generic_file_splice_read(file, &pos, + vio->u.splice.cui_pipe, cnt, + vio->u.splice.cui_flags); + /* LU-1109: do splice read stripe by stripe otherwise if it + * may make nfsd stuck if this read occupied all internal pipe + * buffers. */ + io->ci_continue = 0; + break; + default: + CERROR("Wrong IO type %u\n", vio->cui_io_subtype); + LBUG(); + } + +out: + if (result >= 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, READ); + result = 0; + } + return result; +} + +static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + + if (vio->cui_ra_window_set) + ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead); + + vvp_io_fini(env, ios); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + + if (!can_populate_pages(env, io, inode)) + return 0; + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. 
+ */ + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + cio->cui_iocb->ki_pos = pos; + } else { + LASSERT(cio->cui_iocb->ki_pos == pos); + } + + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); + + if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */ + result = 0; + else + result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter); + + if (result > 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, WRITE); + result = 0; + } + return result; +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->fault.ft_vmf; + + cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf); + cfio->fault.ft_flags_valid = 1; + + if (vmf->page) { + CDEBUG(D_PAGE, + "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", + vmf->page, vmf->page->mapping, vmf->page->index, + (long)vmf->page->flags, page_count(vmf->page), + page_private(vmf->page), vmf->virtual_address); + if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->fault.ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + return 0; + } + + if (cfio->fault.ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + return -EFAULT; + } + + if (cfio->fault.ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + return -ENOMEM; + } + + if (cfio->fault.ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("Unknown error in page fault %d!\n", cfio->fault.ft_flags); + return -EINVAL; +} + + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int result = 0; + struct page *vmpage = NULL; + struct cl_page *page; + loff_t size; + pgoff_t last; /* last page in a file data region */ + + if (fio->ft_executable && + LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime) + CWARN("binary "DFID + " changed while waiting for the page fault lock\n", + PFID(lu_object_fid(&obj->co_lu))); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL); + if (result != 0) + return result; + + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + return result; + } + + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); + + size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ + if (unlikely((vmpage->mapping != inode->i_mapping) || + (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch + * and retry. 
*/ + result = +1; + goto out; + } + + + if (fio->ft_mkwrite) { + pgoff_t last_index; + /* + * Capture the size while holding the lli_trunc_sem from above + * we want to make sure that we complete the mkwrite action + * while holding this lock. We need to make sure that we are + * not past the end of the file. + */ + last_index = cl_index(obj, size - 1); + if (last_index < fio->ft_index) { + CDEBUG(D_PAGE, + "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n", + vmpage->mapping, fio->ft_index, last_index); + /* + * We need to return if we are + * passed the end of the file. This will propagate + * up the call stack to ll_page_mkwrite where + * we will return VM_FAULT_NOPAGE. Any non-negative + * value returned here will be silently + * converted to 0. If the vmpage->mapping is null + * the error code would be converted back to ENODATA + * in ll_page_mkwrite0. Thus we return -ENODATA + * to handle both cases + */ + result = -ENODATA; + goto out; + } + } + + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) { + result = PTR_ERR(page); + goto out; + } + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (set_page_dirty(vmpage)) { + struct ccc_page *cp; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vvp_write_pending(cl2ccc(obj), cp); + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. */ + result = cl_page_cache_add(env, io, page, CRT_WRITE); + LASSERT(cl_page_is_owned(page, io)); + + vmpage = NULL; + if (result < 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + goto out; + } else + cl_page_disown(env, io, page); + } + } + + last = cl_index(obj, size - 1); + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last); + if (fio->ft_index == last) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + cfio->fault.ft_flags &= ~VM_FAULT_LOCKED; + return result; +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. 
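The fault path above sizes the faulted region so that only the page containing i_size is treated as partial; every other page contributes a full page of data. Below is a small worked model of that calculation, an editor's aside rather than part of the imported patch, assuming 4 KiB pages and treating cl_index()/cl_offset() as the plain shifts they reduce to here.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the
 * "partial last page" size calculation from vvp_io_fault_start(),
 * with 4 KiB pages assumed.
 */
#include <assert.h>
#include <stddef.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_SIZE		(1UL << MODEL_PAGE_SHIFT)

/* bytes covered by a fault on page 'index' of a file of 'size' bytes */
static size_t fault_nob(unsigned long index, unsigned long long size)
{
	unsigned long last = (size - 1) >> MODEL_PAGE_SHIFT;

	if (index == last)
		return size - ((unsigned long long)index << MODEL_PAGE_SHIFT);
	return MODEL_PAGE_SIZE;
}

int main(void)
{
	/* 10000-byte file: pages 0 and 1 are full, page 2 holds 1808 bytes */
	assert(fault_nob(0, 10000) == 4096);
	assert(fault_nob(2, 10000) == 10000 - 2 * 4096);
	return 0;
}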
*/ + return 0; +} + +static int vvp_io_read_page(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *page = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = cl2ccc_io(env, ios)->cui_fd; + struct ll_readahead_state *ras = &fd->fd_ras; + struct page *vmpage = cp->cpg_page; + struct cl_2queue *queue = &io->ci_queue; + int rc; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + LASSERT(slice->cpl_obj == obj); + + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ras_update(sbi, inode, ras, page->cp_index, + cp->cpg_defer_uptodate); + + /* Sanity check whether the page is protected by a lock. */ + rc = cl_page_is_under_lock(env, io, page); + if (rc != -EBUSY) { + CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n", + rc == -ENODATA ? "without a lock" : + "match failed", rc); + if (rc != -ENODATA) + return rc; + } + + if (cp->cpg_defer_uptodate) { + cp->cpg_ra_used = 1; + cl_page_export(env, page, 1); + } + /* + * Add page into the queue even when it is marked uptodate above. + * this will unlock it automatically as part of cl_page_list_disown(). + */ + cl_2queue_add(queue, page); + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ll_readahead(env, io, ras, + vmpage->mapping, &queue->c2_qin, fd->fd_flags); + + return 0; +} + +static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct ccc_page *cp, + enum cl_req_type crt) +{ + struct cl_2queue *queue; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + cl_2queue_init_page(queue, page); + + result = cl_io_submit_sync(env, io, crt, queue, 0); + LASSERT(cl_page_is_owned(page, io)); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} + +/** + * Prepare partially written-to page for a write. + */ +static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj, struct cl_page *pg, + struct ccc_page *cp, + unsigned from, unsigned to) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + loff_t offset = cl_offset(obj, pg->cp_index); + int result; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result == 0) { + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = kmap_atomic(cp->cpg_page); + + memset(kaddr, 0, cl_page_size(obj)); + kunmap_atomic(kaddr); + } else if (cp->cpg_defer_uptodate) + cp->cpg_ra_used = 1; + else + result = vvp_page_sync_io(env, io, pg, cp, CRT_READ); + /* + * In older implementations, obdo_refresh_inode is called here + * to update the inode because the write might modify the + * object info at OST. However, this has been proven useless, + * since LVB functions will be called when user space program + * tries to retrieve inode attribute. Also, see bug 15909 for + * details. 
-jay + */ + if (result == 0) + cl_page_export(env, pg, 1); + } + return result; +} + +static int vvp_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + + int result; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == ccc_object_inode(obj)); + + result = 0; + + CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, so _don't_ + * set it up to date until commit_write + */ + if (from == 0 && to == PAGE_CACHE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n"); + POISON_PAGE(page, 0x11); + } else + result = vvp_io_prepare_partial(env, ios->cis_io, obj, + pg, cp, from, to); + } else + CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n"); + return result; +} + +static int vvp_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct cl_io *io = ios->cis_io; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct page *vmpage = cp->cpg_page; + + int result; + int tallyop; + loff_t size; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == inode); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n"); + CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to); + + /* + * queue a write for some time in the future the first time we + * dirty the page. + * + * This is different from what other file systems do: they usually + * just mark page (and some of its buffers) dirty and rely on + * balance_dirty_pages() to start a write-back. Lustre wants write-back + * to be started earlier for the following reasons: + * + * (1) with a large number of clients we need to limit the amount + * of cached data on the clients a lot; + * + * (2) large compute jobs generally want compute-only then io-only + * and the IO should complete as quickly as possible; + * + * (3) IO is batched up to the RPC size and is async until the + * client max cache is hit + * (/proc/fs/lustre/osc/OSC.../max_dirty_mb) + * + */ + if (!PageDirty(vmpage)) { + tallyop = LPROC_LL_DIRTY_MISSES; + result = cl_page_cache_add(env, io, pg, CRT_WRITE); + if (result == 0) { + /* page was added into cache successfully. */ + set_page_dirty(vmpage); + vvp_write_pending(cl2ccc(obj), cp); + } else if (result == -EDQUOT) { + pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + bool need_clip = true; + + /* + * Client ran out of disk space grant. Possible + * strategies are: + * + * (a) do a sync write, renewing grant; + * + * (b) stop writing on this stripe, switch to the + * next one. + * + * (b) is a part of "parallel io" design that is the + * ultimate goal. (a) is what "old" client did, and + * what the new code continues to do for the time + * being. 
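When the commit path below hits the -EDQUOT fallback, the dirty range is widened before the synchronous write: a page lying entirely below i_size is written whole, while the page containing i_size is written at least up to the in-page EOF offset. A compact model of that range adjustment follows, an editor's aside rather than part of the imported patch, assuming 4 KiB pages.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the range
 * widening done on the -EDQUOT fallback in vvp_io_commit_write(),
 * assuming 4 KiB pages.
 */
#include <assert.h>

#define MODEL_PAGE_SIZE	4096UL

/* returns the 'to' offset the synchronous write should cover */
static unsigned long clip_to(unsigned long page_index,
			     unsigned long long i_size, unsigned long to)
{
	unsigned long last_index = i_size / MODEL_PAGE_SIZE;

	if (last_index > page_index)
		return MODEL_PAGE_SIZE;		/* page fully below EOF */
	if (last_index == page_index) {
		unsigned long size_to = i_size % MODEL_PAGE_SIZE;

		if (to < size_to)
			to = size_to;		/* cover up to EOF within the page */
	}
	return to;
}

int main(void)
{
	/* i_size = 30000: page 5 lies entirely below EOF, write it whole */
	assert(clip_to(5, 30000, 100) == 4096);
	/* i_size = 22000 ends inside page 5 at in-page offset 1520 */
	assert(clip_to(5, 22000, 100) == 1520);
	return 0;
}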
+ */ + if (last_index > pg->cp_index) { + to = PAGE_CACHE_SIZE; + need_clip = false; + } else if (last_index == pg->cp_index) { + int size_to = i_size_read(inode) & ~CFS_PAGE_MASK; + if (to < size_to) + to = size_to; + } + if (need_clip) + cl_page_clip(env, pg, 0, to); + result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE); + if (result) + CERROR("Write page %lu of inode %p failed %d\n", + pg->cp_index, inode, result); + } + } else { + tallyop = LPROC_LL_DIRTY_HITS; + result = 0; + } + ll_stats_ops_tally(sbi, tallyop, 1); + + /* Inode should be marked DIRTY even if no new page was marked DIRTY + * because page could have been not flushed between 2 modifications. + * It is important the file is marked DIRTY as soon as the I/O is done + * Indeed, when cache is flushed, file could be already closed and it + * is too late to warn the MDT. + * It is acceptable that file is marked DIRTY even if I/O is dropped + * for some reasons before being flushed to OST. + */ + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + size = cl_offset(obj, pg->cp_index) + to; + + ll_inode_size_lock(inode); + if (result == 0) { + if (size > i_size_read(inode)) { + cl_isize_write_nolock(inode, size); + CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n", + PFID(lu_object_fid(&obj->co_lu)), + (unsigned long)size); + } + cl_page_export(env, pg, 1); + } else { + if (size > i_size_read(inode)) + cl_page_discard(env, io, pg); + } + ll_inode_size_unlock(inode); + return result; +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_read_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_advance = ccc_io_advance + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_advance = ccc_io_advance + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = ccc_io_end + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + } + }, + .cio_read_page = vvp_io_read_page, + .cio_prepare_write = vvp_io_prepare_write, + .cio_commit_write = vvp_io_commit_write +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + struct inode *inode = ccc_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID + " ignore/verify layout %d/%d, layout version %d restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + cio->cui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(cio, cui_cl); + cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops); + vio->cui_ra_window_set = 0; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." 
-- Single Unix Spec */ + if (count == 0) + result = 1; + else + cio->cui_tot_count = count; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* ignore layout change for generic CIT_MISC but not for glimpse. + * io context for glimpse must set ci_verify_layout to true, + * see cl_glimpse_size0() for details. */ + if (io->ci_type == CIT_MISC && !io->ci_verify_layout) + io->ci_ignore_layout = 1; + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. */ + if (result == 0 && !io->ci_ignore_layout) { + result = ll_layout_refresh(inode, &cio->cui_layout_gen); + if (result == -ENOENT) + /* If the inode on MDS has been removed, but the objects + * on OSTs haven't been destroyed (async unlink), layout + * fetch will return -ENOENT, we'd ignore this error + * and continue with dirty flush. LU-3230. */ + result = 0; + if (result < 0) + CERROR("%s: refresh file layout " DFID " error %d.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(lu_object_fid(&obj->co_lu)), result); + } + + return result; +} + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + /* Calling just for assertion */ + cl2ccc_io(env, slice); + return vvp_env_io(env); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c new file mode 100644 index 000000000..f354e82d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. 
+ * + */ + +/** + * Estimates lock value for the purpose of managing the lock cache during + * memory shortages. + * + * Locks for memory mapped files are almost infinitely precious, others are + * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are + * ordered within themselves by weights assigned from other layers. + */ +static unsigned long vvp_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct ccc_object *cob = cl2ccc(slice->cls_obj); + + return atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0; +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_delete = ccc_lock_delete, + .clo_fini = ccc_lock_fini, + .clo_enqueue = ccc_lock_enqueue, + .clo_wait = ccc_lock_wait, + .clo_use = ccc_lock_use, + .clo_unuse = ccc_lock_unuse, + .clo_fits_into = ccc_lock_fits_into, + .clo_state = ccc_lock_state, + .clo_weigh = vvp_lock_weigh +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c new file mode 100644 index 000000000..b6f6d4cb6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c @@ -0,0 +1,201 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct ccc_object *obj = lu2ccc(o); + struct inode *inode = obj->cob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%s %d %d) inode: %p ", + list_empty(&obj->cob_pending_list) ? 
"-" : "+", + obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = ccc_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. + */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = LTIME_S(inode->i_mtime); + attr->cat_atime = LTIME_S(inode->i_atime); + attr->cat_ctime = LTIME_S(inode->i_ctime); + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = ccc_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + LTIME_S(inode->i_atime) = attr->cat_atime; + if (valid & CAT_MTIME) + LTIME_S(inode->i_mtime) = attr->cat_mtime; + if (valid & CAT_CTIME) + LTIME_S(inode->i_ctime) = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + cl_isize_write_nolock(inode, attr->cat_size); + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. 
*/ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + + return 0; + } + + if (conf->coc_opc != OBJECT_CONF_SET) + return 0; + + if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) { + CDEBUG(D_VFSTRACE, DFID ": layout version change: %u -> %u\n", + PFID(&lli->lli_fid), lli->lli_layout_gen, + conf->u.coc_md->lsm->lsm_layout_gen); + + lli->lli_has_smd = lsm_has_objects(conf->u.coc_md->lsm); + ll_layout_version_set(lli, conf->u.coc_md->lsm->lsm_layout_gen); + } else { + CDEBUG(D_VFSTRACE, DFID ": layout nuked: %u.\n", + PFID(&lli->lli_fid), lli->lli_layout_gen); + + lli->lli_has_smd = false; + ll_layout_version_set(lli, LL_LAYOUT_GEN_EMPTY); + } + return 0; +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_set = vvp_attr_set, + .coo_conf_set = vvp_conf_set, + .coo_glimpse = ccc_object_glimpse +}; + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = ccc_object_init, + .loo_object_free = ccc_object_free, + .loo_object_print = vvp_object_print +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + return lu2ccc(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c new file mode 100644 index 000000000..954ed08c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -0,0 +1,551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct ccc_page *cp) +{ + struct page *vmpage = cp->cpg_page; + + LASSERT(vmpage != NULL); + page_cache_release(vmpage); +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. + */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(cp); +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct ccc_page *vpg = cl2ccc_page(slice); + struct page *vmpage = vpg->cpg_page; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + return 0; +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct address_space *mapping; + struct ccc_page *cpg = cl2ccc_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + mapping = vmpage->mapping; + + if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used) + ll_ra_stats_inc(mapping, RA_STAT_DISCARDED); + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). + */ + truncate_complete_page(mapping, vmpage); +} + +static int vvp_page_unmap(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + __u64 offset; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + offset = vmpage->index << PAGE_CACHE_SHIFT; + + /* + * XXX is it safe to call this with the page lock held? 
+ */ + ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE); + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == slice->cpl_page); + LASSERT(inode == ccc_object_inode(obj)); + + vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice)); + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). + */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* Skip the page already marked as PG_uptodate. */ + return PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0; +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice)); + + return 0; +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. 
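As the comment above describes, transfer errors are parked on the inode's address space rather than returned to the writer directly, with -ENOSPC kept distinct from generic I/O errors and the dirty-page-discarded warning emitted at most once per object. The following is a simplified model of that bookkeeping, an editor's aside rather than part of the imported patch; the types and names are stand-ins.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the error
 * classification that vvp_vmpage_error() performs.  -ENOSPC is remembered
 * separately so a later fsync() can report the more specific errno, and
 * the "dirty page discarded" warning fires once per object.
 */
#include <errno.h>
#include <stdbool.h>

enum model_as_error { MODEL_AS_OK, MODEL_AS_ENOSPC, MODEL_AS_EIO };

struct model_object {
	enum model_as_error pending_error;	/* reported at fsync/close time */
	bool discard_warned;			/* warn only once per object */
};

/* returns true when the caller should log the discard warning */
static bool record_io_result(struct model_object *obj, int ioret)
{
	if (ioret == 0) {
		obj->discard_warned = false;	/* healthy again */
		return false;
	}
	obj->pending_error = (ioret == -ENOSPC) ? MODEL_AS_ENOSPC : MODEL_AS_EIO;
	if ((ioret == -ESHUTDOWN || ioret == -EINTR) && !obj->discard_warned) {
		obj->discard_warned = true;
		return true;
	}
	return false;
}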
+ */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret) +{ + struct ccc_object *obj = cl_inode2ccc(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->cob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR) && + obj->cob_discard_page_warned == 0) { + obj->cob_discard_page_warned = 1; + ll_dirty_page_discard_warn(vmpage, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + struct cl_page *page = cl_page_top(slice->cpl_page); + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (cp->cpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!cp->cpg_defer_uptodate) + cl_page_export(env, page, 1); + } else + cp->cpg_defer_uptodate = 0; + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + + LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage))); + LASSERT(PageWriteback(vmpage)); + + CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); + + /* + * TODO: Actually it makes sense to add the page into oap pending + * list again and so that we don't need to take the page out from + * SoM write pending list, if we just meet a recoverable error, + * -ENOMEM, etc. + * To implement this, we just need to return a non zero value in + * ->cpo_completion method. The underlying transfer should be notified + * and then re-add the page into pending transfer queue. -jay + */ + + cp->cpg_write_queued = 0; + vvp_write_complete(cl2ccc(slice->cpl_obj), cp); + + /* + * Only mark the page error only when it's an async write because + * applications won't wait for IO to finish. + */ + if (pg->cp_sync_io == NULL) + vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret); + + end_page_writeback(vmpage); +} + +/** + * Implements cl_page_operations::cpo_make_ready() method. + * + * This is called to yank a page from the transfer cache and to send it out as + * a part of transfer. This function try-locks the page. If try-lock failed, + * page is owned by some concurrent IO, and should be skipped (this is bad, + * but hopefully rare situation, as it usually results in transfer being + * shorter than possible). + * + * \retval 0 success, page can be placed into transfer + * + * \retval -EAGAIN page is either used by concurrent IO has been + * truncated. Skip it. + */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result = 0; + + lock_page(vmpage); + if (clear_page_dirty_for_io(vmpage)) { + LASSERT(pg->cp_state == CPS_CACHED); + /* This actually clears the dirty bit in the radix + * tree. 
*/ + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), + cl2ccc_page(slice)); + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + } else if (pg->cp_state == CPS_PAGEOUT) { + /* is it possible for osc_flush_async_page() to already + * make it ready? */ + result = -EALREADY; + } else { + CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", + pg->cp_state); + LBUG(); + } + unlock_page(vmpage); + return result; +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct ccc_page *vp = cl2ccc_page(slice); + struct page *vmpage = vp->cpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME "-page@%p(%d:%d:%d) vm@%p ", + vp, vp->cpg_defer_uptodate, vp->cpg_ra_used, + vp->cpg_write_queued, vmpage); + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ? "not-" : ""); + } + (*printer)(env, cookie, "\n"); + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_vmpage = ccc_page_vmpage, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_unmap = vvp_page_unmap, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = ccc_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + } + } +}; + +static void vvp_transient_page_verify(const struct cl_page *page) +{ + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(!mutex_trylock(&inode->i_mutex)); +} + +static int vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, int nonblock) +{ + vvp_transient_page_verify(slice->cpl_page); + return 0; +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = ccc_object_inode(slice->cpl_obj); + int locked; + + locked = !mutex_trylock(&inode->i_mutex); + if (!locked) + mutex_unlock(&inode->i_mutex); + return locked ? 
-EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *clp = slice->cpl_page; + struct ccc_object *clobj = cl2ccc(clp->cp_obj); + + vvp_page_fini_common(cp); + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + clobj->cob_transient_pages--; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_vmpage = ccc_page_vmpage, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + } + } +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct ccc_page *cpg = cl_object_page_slice(obj, page); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + cpg->cpg_page = vmpage; + page_cache_get(vmpage); + + INIT_LIST_HEAD(&cpg->cpg_pending_linkage); + if (page->cp_type == CPT_CACHEABLE) { + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_page_ops); + } else { + struct ccc_object *clobj = cl2ccc(obj); + + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_transient_page_ops); + clobj->cob_transient_pages++; + } + return 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/xattr.c b/kernel/drivers/staging/lustre/lustre/llite/xattr.c new file mode 100644 index 000000000..e0fcbe139 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/xattr.c @@ -0,0 +1,621 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_eacl.h" + +#include "llite_internal.h" + +#define XATTR_USER_T (1) +#define XATTR_TRUSTED_T (2) +#define XATTR_SECURITY_T (3) +#define XATTR_ACL_ACCESS_T (4) +#define XATTR_ACL_DEFAULT_T (5) +#define XATTR_LUSTRE_T (6) +#define XATTR_OTHER_T (7) + +static +int get_xattr_type(const char *name) +{ + if (!strcmp(name, POSIX_ACL_XATTR_ACCESS)) + return XATTR_ACL_ACCESS_T; + + if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT)) + return XATTR_ACL_DEFAULT_T; + + if (!strncmp(name, XATTR_USER_PREFIX, + sizeof(XATTR_USER_PREFIX) - 1)) + return XATTR_USER_T; + + if (!strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1)) + return XATTR_TRUSTED_T; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1)) + return XATTR_SECURITY_T; + + if (!strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1)) + return XATTR_LUSTRE_T; + + return XATTR_OTHER_T; +} + +static +int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) +{ + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + if (xattr_type == XATTR_TRUSTED_T && !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + if (xattr_type == XATTR_OTHER_T) + return -EOPNOTSUPP; + + return 0; +} + +static +int ll_setxattr_common(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int xattr_type, rc; + struct obd_capa *oc; +#ifdef CONFIG_FS_POSIX_ACL + struct rmtacl_ctl_entry *rce = NULL; + posix_acl_xattr_header *new_value = NULL; + ext_acl_xattr_header *acl = NULL; +#endif + const char *pv = value; + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + return rc; + + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !inode_owner_or_capable(inode)) + return -EPERM; + + /* b10667: ignore lustre special xattr for now */ + if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) || + (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0)) + return 0; + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + return 0; + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + return -EOPNOTSUPP; + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_RSETFACL)) + return -EOPNOTSUPP; + + if (rce->rce_ops == RMT_LSETFACL) { + struct eacl_entry *ee; + + ee = et_search_del(&sbi->ll_et, current_pid(), + ll_inode2fid(inode), xattr_type); + LASSERT(ee != NULL); + if (valid & OBD_MD_FLXATTR) { + acl = lustre_acl_xattr_merge2ext( + 
(posix_acl_xattr_header *)value, + size, ee->ee_acl); + if (IS_ERR(acl)) { + ee_free(ee); + return PTR_ERR(acl); + } + size = CFS_ACL_XATTR_SIZE(\ + le32_to_cpu(acl->a_count), \ + ext_acl_xattr); + pv = (const char *)acl; + } + ee_free(ee); + } else if (rce->rce_ops == RMT_RSETFACL) { + size = lustre_posix_acl_xattr_filter( + (posix_acl_xattr_header *)value, + size, &new_value); + if (unlikely(size < 0)) + return size; + + pv = (const char *)new_value; + } else + return -EOPNOTSUPP; + + valid |= rce_ops2valid(rce->rce_ops); + } +#endif + oc = ll_mdscapa_get(inode); + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid, name, pv, size, 0, flags, + ll_i2suppgid(inode), &req); + capa_put(oc); +#ifdef CONFIG_FS_POSIX_ACL + if (new_value != NULL) + lustre_posix_acl_xattr_free(new_value, size); + if (acl != NULL) + lustre_ext_acl_xattr_free(acl); +#endif + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + return rc; + } + + ptlrpc_req_finished(req); + return 0; +} + +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_user_md *lump = (struct lov_user_md *)value; + int rc = 0; + + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + /* Attributes that are saved via getxattr will always have + * the stripe_offset as 0. Instead, the MDS should be + * allowed to pick the starting OST index. b=17846 */ + if (lump != NULL && lump->lmm_stripe_offset == 0) + lump->lmm_stripe_offset = -1; + + if (lump != NULL && S_ISREG(inode->i_mode)) { + int flags = FMODE_WRITE; + int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ? 
+ sizeof(*lump) : sizeof(struct lov_user_md_v3); + + rc = ll_lov_setstripe_ea_info(inode, dentry, flags, lump, + lum_size); + /* b10667: rc always be 0 here for now */ + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; + + } else if (strcmp(name, XATTR_NAME_LMA) == 0 || + strcmp(name, XATTR_NAME_LINK) == 0) + return 0; + + return ll_setxattr_common(inode, name, value, size, flags, + OBD_MD_FLXATTR); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + return ll_setxattr_common(inode, name, NULL, 0, 0, + OBD_MD_FLXATTRRM); +} + +static +int ll_getxattr_common(struct inode *inode, const char *name, + void *buffer, size_t size, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + int xattr_type, rc; + void *xdata; + struct obd_capa *oc; + struct rmtacl_ctl_entry *rce = NULL; + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + /* listxattr have slightly different behavior from of ext3: + * without 'user_xattr' ext3 will list all xattr names but + * filtered out "^user..*"; we list them all for simplicity. + */ + if (!name) { + xattr_type = XATTR_OTHER_T; + goto do_getxattr; + } + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + return rc; + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + return -ENODATA; + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + return -EOPNOTSUPP; + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_LGETFACL && + rce->rce_ops != RMT_RSETFACL && + rce->rce_ops != RMT_RGETFACL)) + return -EOPNOTSUPP; + } + + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
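Every entry point in this file funnels attribute names through the same namespace classification and policy filter shown earlier (get_xattr_type() and xattr_type_filter()): "user." names need the user_xattr mount option, "trusted." names need CAP_SYS_ADMIN, ACL names need ACL support, and unknown namespaces are rejected. As an editor's aside, not part of the imported patch, that classification reduces to prefix matching over the standard xattr namespaces.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the xattr
 * namespace classification used throughout this file, restated over the
 * literal prefixes.
 */
#include <string.h>

enum model_xattr_type {
	MODEL_XATTR_USER, MODEL_XATTR_TRUSTED, MODEL_XATTR_SECURITY,
	MODEL_XATTR_ACL_ACCESS, MODEL_XATTR_ACL_DEFAULT,
	MODEL_XATTR_LUSTRE, MODEL_XATTR_OTHER,
};

static enum model_xattr_type classify_xattr(const char *name)
{
	if (strcmp(name, "system.posix_acl_access") == 0)
		return MODEL_XATTR_ACL_ACCESS;
	if (strcmp(name, "system.posix_acl_default") == 0)
		return MODEL_XATTR_ACL_DEFAULT;
	if (strncmp(name, "user.", 5) == 0)
		return MODEL_XATTR_USER;
	if (strncmp(name, "trusted.", 8) == 0)
		return MODEL_XATTR_TRUSTED;
	if (strncmp(name, "security.", 9) == 0)
		return MODEL_XATTR_SECURITY;
	if (strncmp(name, "lustre.", 7) == 0)
		return MODEL_XATTR_LUSTRE;
	return MODEL_XATTR_OTHER;	/* rejected with -EOPNOTSUPP */
}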
+ */ + if (xattr_type == XATTR_ACL_ACCESS_T && + !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + return -ENODATA; + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + return rc; + } + if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + return -ENODATA; +#endif + +do_getxattr: + if (sbi->ll_xattr_cache_enabled && xattr_type != XATTR_ACL_ACCESS_T) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + goto out_xattr; + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl != NULL && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + rc = -ERANGE; + goto out_xattr; + } + } + } else { +getxattr_nocache: + oc = ll_mdscapa_get(inode); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid | (rce ? rce_ops2valid(rce->rce_ops) : 0), + name, NULL, 0, size, 0, &req); + capa_put(oc); + + if (rc < 0) + goto out_xattr; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + + /* only detect the xattr size */ + if (size == 0) { + rc = body->eadatasize; + goto out; + } + + if (size < body->eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->eadatasize, (int)size); + rc = -ERANGE; + goto out; + } + + if (body->eadatasize == 0) { + rc = -ENODATA; + goto out; + } + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->eadatasize); + if (!xdata) { + rc = -EFAULT; + goto out; + } + + memcpy(buffer, xdata, body->eadatasize); + rc = body->eadatasize; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (rce && rce->rce_ops == RMT_LSETFACL) { + ext_acl_xattr_header *acl; + + acl = lustre_posix_acl_xattr_2ext( + (posix_acl_xattr_header *)buffer, rc); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + goto out; + } + + rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode), + xattr_type, acl); + if (unlikely(rc < 0)) { + lustre_ext_acl_xattr_free(acl); + goto out; + } + } +#endif + +out_xattr: + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO( + "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + return rc; +} + +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_stripe_md *lsm; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int rc = 0, lmmsize = 0; + + if (!S_ISREG(inode->i_mode) && 
!S_ISDIR(inode->i_mode)) + return -ENODATA; + + if (size == 0 && S_ISDIR(inode->i_mode)) { + /* XXX directory EA is fix for now, optimize to save + * RPC transfer */ + rc = sizeof(struct lov_user_md); + goto out; + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) { + if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_getstripe(inode, &lmm, + &lmmsize, &request); + } else { + rc = -ENODATA; + } + } else { + /* LSM is present already after lookup/getattr call. + * we need to grab layout lock once it is implemented */ + rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm); + lmmsize = rc; + } + ccc_inode_lsm_put(inode, lsm); + + if (rc < 0) + goto out; + + if (size == 0) { + /* used to call ll_get_max_mdsize() forward to get + * the maximum buffer size, while some apps (such as + * rsync 3.0.x) care much about the exact xattr value + * size */ + rc = lmmsize; + goto out; + } + + if (size < lmmsize) { + CERROR("server bug: replied size %d > %d for %pd (%s)\n", + lmmsize, (int)size, dentry, name); + rc = -ERANGE; + goto out; + } + + lump = (struct lov_user_md *)buffer; + memcpy(lump, lmm, lmmsize); + /* do not return layout gen for getxattr otherwise it would + * confuse tar --xattr by recognizing layout gen as stripe + * offset when the file is restored. See LU-2809. */ + lump->lmm_layout_gen = 0; + + rc = lmmsize; +out: + if (request) + ptlrpc_req_finished(request); + else if (lmm) + obd_free_diskmd(ll_i2dtexp(inode), &lmm); + return rc; + } + + return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + int rc = 0, rc2 = 0; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int lmmsize; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS); + if (rc < 0) + goto out; + + if (buffer != NULL) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name = buffer; + int xlen, rem = rc; + + while (rem > 0) { + xlen = strnlen(xattr_name, rem - 1) + 1; + rem -= xlen; + if (xattr_type_filter(sbi, + get_xattr_type(xattr_name)) == 0) { + /* skip OK xattr type + * leave it in buffer + */ + xattr_name += xlen; + continue; + } + /* move up remaining xattrs in buffer + * removing the xattr that is not OK + */ + memmove(xattr_name, xattr_name + xlen, rem); + rc -= xlen; + } + } + if (S_ISREG(inode->i_mode)) { + if (!ll_i2info(inode)->lli_has_smd) + rc2 = -1; + } else if (S_ISDIR(inode->i_mode)) { + rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (rc2 < 0) { + rc2 = 0; + goto out; + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) { + const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1; + const size_t name_len = sizeof("lov") - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (((rc + total_len) > size) && (buffer != NULL)) { + ptlrpc_req_finished(request); + return -ERANGE; + } + + if (buffer != NULL) { + buffer += rc; + memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len); + memcpy(buffer + prefix_len, "lov", name_len); + buffer[prefix_len + name_len] = '\0'; + } + rc2 = total_len; + } +out: + ptlrpc_req_finished(request); + rc = rc + rc2; + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c b/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c new file mode 100644 index 
000000000..69ea92adf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c @@ -0,0 +1,538 @@ +/* + * Copyright 2012 Xyratex Technology Limited + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + lli->lli_flags |= LLIF_XATTR_CACHE; +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + return 0; + } + } + + return -ENODATA; +} + +/** + * This adds an xattr. 
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length. + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + return -EPROTO; + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + return -ENOMEM; + } + + xattr->xe_namelen = strlen(xattr_name) + 1; /* includes the trailing NUL */ + + xattr->xe_name = kstrdup(xattr_name, GFP_NOFS); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + xattr->xe_value = kmemdup(xattr_val, xattr_val_len, GFP_NOFS); + if (!xattr->xe_value) + goto err_value; + + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + return 0; +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + return -ENOMEM; +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + return 0; + } + + return -ENODATA; +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ERANGE if the list does not fit into @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + return -ERANGE; + + return xld_tail; +} + +/** + * Check if the xattr cache is initialized (filled). + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return !!(lli->lli_flags & LLIF_XATTR_CACHE); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer.
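+ * The caller must hold lli_xattrs_list_rwsem for writing.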
+ * + * \retval 0 no error occurred + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + + + if (!ll_xattr_cache_valid(lli)) + return 0; + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + ; /* empty loop */ + lli->lli_flags &= ~LLIF_XATTR_CACHE; + + return 0; +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + return rc; +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with the list lock held. + * + * \retval 0 no error occurred + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + ldlm_mode_t mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(oit), + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_valid(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->d.lustre.it_lock_handle = lockh.cookie; + oit->d.lustre.it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. */ + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_xattrs_enq_lock); + return PTR_ERR(op_data); + } + + op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; + + rc = md_enqueue(exp, &einfo, oit, op_data, &lockh, NULL, 0, NULL, 0); + ll_finish_md_op_data(op_data); + + if (rc < 0) { + CDEBUG(D_CACHE, + "md_intent_lock failed with %d for fid "DFID"\n", + rc, PFID(ll_inode2fid(inode))); + mutex_unlock(&lli->lli_xattrs_enq_lock); + return rc; + } + + *req = (struct ptlrpc_request *)oit->d.lustre.it_data; +out: + down_write(&lli->lli_xattrs_list_rwsem); + mutex_unlock(&lli->lli_xattrs_enq_lock); + + return 0; +} + +/** + * Refill the xattr cache. + * + * Fetch and cache the whole of xattrs for @inode, acquiring + * a read or a write xattr lock depending on operation in @oit. + * Intent is dropped on exit unless the operation is setxattr. + * + * \retval 0 no error occurred + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + */ +static int ll_xattr_cache_refill(struct inode *inode, struct lookup_intent *oit) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *xdata, *xval, *xtail, *xvtail; + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body; + __u32 *xsizes; + int rc = 0, i; + + + + rc = ll_xattr_find_get_lock(inode, oit, &req); + if (rc) + goto out_no_unlock; + + /* Do we have the data at this point? 
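+ * A parallel refill may have populated the cache while we were waiting for the lock.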
*/ + if (ll_xattr_cache_valid(lli)) { + ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); + rc = 0; + goto out_maybe_drop; + } + + /* Matched but no cache? Cancelled on error by a parallel refill. */ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + rc = -EIO; + goto out_maybe_drop; + } + + if (oit->d.lustre.it_status < 0) { + CDEBUG(D_CACHE, "getxattr intent returned %d for fid "DFID"\n", + oit->d.lustre.it_status, PFID(ll_inode2fid(inode))); + rc = oit->d.lustre.it_status; + /* xattr data is so large that we don't want to cache it */ + if (rc == -ERANGE) + rc = -EAGAIN; + goto out_destroy; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + rc = -EPROTO; + goto out_destroy; + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->max_mdsize * sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + rc = -EPROTO; + goto out_destroy; + } + + xtail = xdata + body->eadatasize; + xvtail = xval + body->aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + ll_xattr_cache_init(lli); + + for (i = 0; i < body->max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + goto out_destroy; + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + + ll_set_lock_data(sbi->ll_md_exp, inode, oit, NULL); + + goto out_maybe_drop; +out_maybe_drop: + + ll_intent_drop_lock(oit); + + if (rc != 0) + up_write(&lli->lli_xattrs_list_rwsem); +out_no_unlock: + ptlrpc_req_finished(req); + + return rc; + +out_destroy: + up_write(&lli->lli_xattrs_list_rwsem); + + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit->d.lustre.it_lock_handle, + oit->d.lustre.it_lock_mode); + + goto out_no_unlock; +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
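+ * When @size is zero only the required buffer size is returned and no data is copied.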
+ * + * \retval 0 no error occurred + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct lookup_intent oit = { .it_op = IT_GETXATTR }; + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode, &oit); + if (rc) + return rc; + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + goto out; +out: + up_read(&lli->lli_xattrs_list_rwsem); + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/Makefile b/kernel/drivers/staging/lustre/lustre/lmv/Makefile new file mode 100644 index 000000000..a7a15369a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += lmv.o +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o +lmv-$(CONFIG_PROC_FS) += lproc_lmv.o diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c new file mode 100644 index 000000000..ee235926f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c @@ -0,0 +1,83 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include + +#include "../include/obd_support.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, + const struct lu_fid *fid, + u32 *mds) +{ + int rc; + + /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed */ + LASSERTF((fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid))) && + fid_is_sane(fid), DFID" is insane!\n", PFID(fid)); + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("Error while looking for mds number. Seq %#llx, err = %d\n", + fid_seq(fid), rc); + return rc; + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->desc.ld_tgt_count) { + CERROR("FLD lookup got invalid mds #%x (max: %x) for fid=" DFID "\n", *mds, lmv->desc.ld_tgt_count, + PFID(fid)); + rc = -EINVAL; + } + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c new file mode 100644 index 000000000..d22d57b4f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c @@ -0,0 +1,323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include "../include/lustre_intent.h" +#include "../include/obd_support.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, void *lmm, + int lmmsize, struct lookup_intent *it, + const struct lu_fid *parent_fid, int flags, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + LASSERT((body->valid & OBD_MD_MDS)); + + /* + * Unfortunately, we have to lie to MDC/MDS to retrieve + * attributes llite needs and provide proper locking. + */ + if (it->it_op & IT_LOOKUP) + it->it_op = IT_GETATTR; + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + if (pmode) { + plock.cookie = it->d.lustre.it_lock_handle; + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + } + + LASSERT(fid_is_sane(&body->fid1)); + + tgt = lmv_find_target(lmv, &body->fid1); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto out; + } + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + rc = -ENOMEM; + goto out; + } + + op_data->op_fid1 = body->fid1; + /* Send the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + /* Add object FID to op_fid3, in case it needs to check stale + * (M_CHECK_STALE), see mdc_finish_intent_lock */ + op_data->op_fid3 = body->fid1; + } + + op_data->op_bias = MDS_CROSS_REF; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n", + PFID(&body->fid1), tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, &req, cb_blocking, extra_lock_flags); + if (rc) + goto out_free_op_data; + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. + */ + if (it->d.lustre.it_lock_mode != 0) { + it->d.lustre.it_remote_lock_handle = + it->d.lustre.it_lock_handle; + it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode; + } + + it->d.lustre.it_lock_handle = plock.cookie; + it->d.lustre.it_lock_mode = pmode; + +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +/* + * IT_OPEN is intended to open (and create, possibly) an object. Parent (pid) + * may be split dir.
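+ * The target MDT is chosen from the parent FID (op_fid1) via lmv_locate_mds().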
+ */ +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && + !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For open with IT_CREATE and for IT_CREATE cases allocate new + * fid and setup FLD for it. + */ + op_data->op_fid3 = op_data->op_fid2; + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc != 0) + return rc; + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%d\n", + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) + return rc; + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) && + !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) && + !(it->d.lustre.it_disposition & DISP_OPEN_OPEN)) + return rc; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + /* + * Not cross-ref case, just get out of here. + */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + /* + * Okay, MDS has returned success. Probably name has been resolved in + * remote inode. + */ + rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) { + LASSERT(rc < 0); + /* + * This is possible, that some userspace application will try to + * open file as directory and we will have -ENOTDIR here. As + * this is normal situation, we should not print error here, + * only debug info. + */ + CDEBUG(D_INODE, "Can't handle remote %s: dir " DFID "(" DFID "):%*s: %d\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), op_data->op_namelen, + op_data->op_name, rc); + return rc; + } + + return rc; +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc = 0; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_idx); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, extra_lock_flags); + + if (rc < 0 || *reqp == NULL) + return rc; + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. 
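+ * A cross-ref reply (OBD_MD_MDS set in body->valid) is chased further via lmv_intent_remote().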
+ */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp, + cb_blocking, extra_lock_flags); + + return rc; +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + int rc; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n", + LL_IT2STR(it), op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT)) + rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h new file mode 100644 index 000000000..852d78721 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include "../include/lustre/lustre_idl.h" +#include "../include/obd.h" + +#define LMV_MAX_TGT_COUNT 128 + +#define lmv_init_lock(lmv) mutex_lock(&lmv->init_mutex) +#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex) + +#define LL_IT2STR(it) \ + ((it) ? 
ldlm_it2str((it)->it_op) : "0") + +int lmv_check_connect(struct obd_device *obd); + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req) +{ + struct mdt_body *body; + struct lmv_stripe_md *mea; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + if (!body || !S_ISDIR(body->mode) || !body->eadatasize) + return NULL; + + mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(mea != NULL); + + if (mea->mea_count == 0) + return NULL; + if (mea->mea_magic != MEA_MAGIC_LAST_CHAR && + mea->mea_magic != MEA_MAGIC_ALL_CHARS && + mea->mea_magic != MEA_MAGIC_HASH_SEGMENT) + return NULL; + + return mea; +} + +static inline int lmv_get_easize(struct lmv_obd *lmv) +{ + return sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * + sizeof(struct lu_fid); +} + +static inline struct lmv_tgt_desc * +lmv_get_target(struct lmv_obd *lmv, u32 mds) +{ + int count = lmv->desc.ld_tgt_count; + int i; + + for (i = 0; i < count; i++) { + if (lmv->tgts[i] == NULL) + continue; + + if (lmv->tgts[i]->ltd_idx == mds) + return lmv->tgts[i]; + } + + return ERR_PTR(-ENODEV); +} + +static inline struct lmv_tgt_desc * +lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + u32 mds = 0; + int rc; + + if (lmv->desc.ld_tgt_count > 1) { + rc = lmv_fld_lookup(lmv, fid, &mds); + if (rc) + return ERR_PTR(rc); + } + + return lmv_get_target(lmv, mds); +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid); +/* lproc_lmv.c */ +#if defined(CONFIG_PROC_FS) +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif +extern struct file_operations lmv_proc_target_fops; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c new file mode 100644 index 000000000..b9459faf8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c @@ -0,0 +1,2892 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../include/lustre/lustre_idl.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_fid.h" +#include "lmv_internal.h" + +static void lmv_activate_target(struct lmv_obd *lmv, + struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid, + int activate) +{ + struct lmv_tgt_desc *uninitialized_var(tgt); + struct obd_device *obd; + int i; + int rc = 0; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i, + tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (i == lmv->desc.ld_tgt_count) { + rc = -EINVAL; + goto out_lmv_lock; + } + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) { + rc = -ENOTCONN; + goto out_lmv_lock; + } + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, i); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + goto out_lmv_lock; + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? 
"" : "in"); + lmv_activate_target(lmv, tgt, activate); + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +static struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return obd_get_uuid(lmv->tgts[0]->ltd_exp); +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + return -EINVAL; + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + return rc; + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } +#if 0 + else if (ev == OBD_NOTIFY_DISCON) { + /* + * For disconnect event, flush fld cache for failout MDS case. + */ + fld_client_flush(&lmv->lmv_fld); + } +#endif + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev, data); + + return rc; +} + +/** + * This is fake connect function. Its purpose is to initialize lmv and say + * caller that everything is okay. Real connection will be performed later. + */ +static int lmv_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + int rc = 0; + + /* + * We don't want to actually do the underlying connections more than + * once, so keep track. + */ + lmv->refcount++; + if (lmv->refcount > 1) { + *exp = NULL; + return 0; + } + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + return rc; + } + + *exp = class_conn2export(&conn); + class_export_get(*exp); + + lmv->exp = *exp; + lmv->connected = 0; + lmv->cluuid = *cluuid; + + if (data) + lmv->conn_data = *data; + + if (obd->obd_proc_private != NULL) { + lmv_proc_dir = obd->obd_proc_private; + } else { + lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lmv_proc_dir)) { + CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.", + obd->obd_type->typ_name, obd->obd_name); + lmv_proc_dir = NULL; + } + obd->obd_proc_private = lmv_proc_dir; + } + + /* + * All real clients should perform actual connection right away, because + * it is possible, that LMV will not have opportunity to connect targets + * and MDC stuff will be called directly, for instance while reading + * ../mdc/../kbytesfree procfs file, etc. 
+ */ + if (data->ocd_connect_flags & OBD_CONNECT_REAL) + rc = lmv_check_connect(obd); + + if (rc && lmv_proc_dir) { + lprocfs_remove(&lmv_proc_dir); + obd->obd_proc_private = NULL; + } + + return rc; +} + +static void lmv_set_timeouts(struct obd_device *obd) +{ + struct lmv_tgt_desc *tgt; + struct lmv_obd *lmv; + int i; + + lmv = &obd->u.lmv; + if (lmv->server_timeout == 0) + return; + + if (lmv->connected == 0) + return; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + + obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS), + KEY_INTERMDS, 0, NULL, NULL); + } +} + +static int lmv_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize, int def_cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc = 0; + int change = 0; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + if (lmv->max_cookiesize < cookiesize) { + lmv->max_cookiesize = cookiesize; + change = 1; + } + if (lmv->max_def_cookiesize < def_cookiesize) { + lmv->max_def_cookiesize = def_cookiesize; + change = 1; + } + if (change == 0) + return 0; + + if (lmv->connected == 0) + return 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) { + CWARN("%s: NULL export for %d\n", obd->obd_name, i); + continue; + } + + rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize, + cookiesize, def_cookiesize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d: rc = %d.\n", + obd->obd_name, i, rc); + break; + } + } + return rc; +} + +#define MAX_STRING_SIZE 128 + +static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *cluuid = &lmv->cluuid; + struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" }; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + return -EINVAL; + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid, + cluuid->uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + return -EINVAL; + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid, + &lmv->conn_data, NULL); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + return rc; + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + return rc; + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_idx; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + return rc; + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. 
+ */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE, + (void *)(tgt - lmv->tgts[0])); + if (rc) { + obd_disconnect(mdc_exp); + return rc; + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize, + lmv->max_cookiesize, lmv->max_def_cookiesize); + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_proc_dir = obd->obd_proc_private; + if (lmv_proc_dir) { + struct proc_dir_entry *mdc_symlink; + + LASSERT(mdc_obd->obd_type != NULL); + LASSERT(mdc_obd->obd_type->typ_name != NULL); + mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, + lmv_proc_dir, + "../../../%s/%s", + mdc_obd->obd_type->typ_name, + mdc_obd->obd_name); + if (mdc_symlink == NULL) { + CERROR("Could not register LMV target /proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + lprocfs_remove(&lmv_proc_dir); + obd->obd_proc_private = NULL; + } + } + return 0; +} + +static void lmv_del_target(struct lmv_obd *lmv, int index) +{ + if (lmv->tgts[index] == NULL) + return; + + OBD_FREE_PTR(lmv->tgts[index]); + lmv->tgts[index] = NULL; + return; +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + + lmv_init_lock(lmv); + + if (lmv->desc.ld_tgt_count == 0) { + struct obd_device *mdc_obd; + + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + lmv_init_unlock(lmv); + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + return -EINVAL; + } + } + + if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { + tgt = lmv->tgts[index]; + CERROR("%s: UUID %s already assigned at LOV target index %d: rc = %d\n", + obd->obd_name, + obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); + lmv_init_unlock(lmv); + return -EEXIST; + } + + if (index >= lmv->tgts_size) { + /* We need to reallocate the lmv target array. 
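+ * The array is grown in powers of two until the requested index fits.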
*/ + struct lmv_tgt_desc **newtgts, **old = NULL; + __u32 newsize = 1; + __u32 oldsize = 0; + + while (newsize < index + 1) + newsize <<= 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + lmv_init_unlock(lmv); + return -ENOMEM; + } + + if (lmv->tgts_size) { + memcpy(newtgts, lmv->tgts, + sizeof(*newtgts) * lmv->tgts_size); + old = lmv->tgts; + oldsize = lmv->tgts_size; + } + + lmv->tgts = newtgts; + lmv->tgts_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, + lmv->tgts_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + lmv_init_unlock(lmv); + return -ENOMEM; + } + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_idx = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + lmv->tgts[index] = tgt; + if (index >= lmv->desc.ld_tgt_count) + lmv->desc.ld_tgt_count = index + 1; + + if (lmv->connected) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) { + spin_lock(&lmv->lmv_lock); + lmv->desc.ld_tgt_count--; + memset(tgt, 0, sizeof(*tgt)); + spin_unlock(&lmv->lmv_lock); + } else { + int easize = sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0); + } + } + + lmv_init_unlock(lmv); + return rc; +} + +int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i; + int rc; + int easize; + + if (lmv->connected) + return 0; + + lmv_init_lock(lmv); + if (lmv->connected) { + lmv_init_unlock(lmv); + return 0; + } + + if (lmv->desc.ld_tgt_count == 0) { + lmv_init_unlock(lmv); + CERROR("%s: no targets configured.\n", obd->obd_name); + return -EINVAL; + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + lmv->cluuid.uuid, obd->obd_name); + + LASSERT(lmv->tgts != NULL); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + rc = lmv_connect_mdc(obd, tgt); + if (rc) + goto out_disc; + } + + lmv_set_timeouts(obd); + class_export_put(lmv->exp); + lmv->connected = 1; + easize = lmv_get_easize(lmv); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0); + lmv_init_unlock(lmv); + return 0; + + out_disc: + while (i-- > 0) { + int rc2; + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + tgt->ltd_active = 0; + if (tgt->ltd_exp) { + --lmv->desc.ld_active_tgt_count; + rc2 = obd_disconnect(tgt->ltd_exp); + if (rc2) { + CERROR("LMV target %s disconnect on MDC idx %d: error %d\n", + tgt->ltd_uuid.uuid, i, rc2); + } + } + } + class_disconnect(lmv->exp); + lmv_init_unlock(lmv); + return rc; +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + } + + lmv_proc_dir = obd->obd_proc_private; + if (lmv_proc_dir) + lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir); + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finalize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + 
CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + return 0; +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + int i; + + if (!lmv->tgts) + goto out_local; + + /* + * Only disconnect the underlying layers on the final disconnect. + */ + lmv->refcount--; + if (lmv->refcount != 0) + goto out_local; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + lmv_disconnect_mdc(obd, lmv->tgts[i]); + } + + if (obd->obd_proc_private) + lprocfs_remove((struct proc_dir_entry **)&obd->obd_proc_private); + else + CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", + obd->obd_type->typ_name, obd->obd_name); + +out_local: + /* + * This is the case when no real connection is established by + * lmv_check_connect(). + */ + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + if (lmv->refcount == 0) + lmv->connected = 0; + return rc; +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + int remote_gf_size = 0; + int rc; + + gf = (struct getinfo_fid2path *)karg; + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if (rc != 0 && rc != -EREMOTE) + goto out_fid2path; + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segment to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_path) + + strlen(gf->gf_path) > ori_gf->gf_pathlen) { + rc = -EOVERFLOW; + goto out_fid2path; + } + + ptr = ori_gf->gf_path; + + memmove(ptr + strlen(gf->gf_path) + 1, ptr, + strlen(ori_gf->gf_path)); + + strncpy(ptr, gf->gf_path, strlen(gf->gf_path)); + ptr += strlen(gf->gf_path); + *ptr = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + goto out_fid2path; + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) { + rc = -ENOMEM; + goto out_fid2path; + } + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + rc = -EINVAL; + goto out_fid2path; + } + + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) { + rc = -EINVAL; + goto out_fid2path; + } + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen); + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + return rc; +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + int i, nr = 0; + struct lmv_tgt_desc *curr_tgt; + + /* count how 
many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static void lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + int i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, + &hur_in->hur_user_item[i].hui_fid); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); +} + +static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len, + struct lustre_kernelcomm *lk, void *uarg) +{ + int i, rc = 0; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + /* best effort: try to clean as much as possible + * (continue on error) */ + obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg); + } + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group); + return rc; +} + +static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len, + struct lustre_kernelcomm *lk, void *uarg) +{ + struct file *filp; + int i, j, err; + int rc = 0; + bool any_set = false; + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, + len, lk, uarg); + if (err) { + if (lmv->tgts[i]->ltd_active) { + /* permanent error */ + CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n", + lmv->tgts[i]->ltd_uuid.uuid, + i, cmd, err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + /* unregister from previous MDS */ + for (j = 0; j < i; j++) + obd_iocontrol(cmd, + lmv->tgts[j]->ltd_exp, + len, lk, uarg); + return rc; + } + /* else: transient error. 
+ * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + return -ENOTCONN; + + /* at least one registration done, with no failure */ + filp = fget(lk->lk_wfd); + if (filp == NULL) { + return -EBADF; + } + rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, lk->lk_data); + if (rc != 0 && filp != NULL) + fput(filp); + return rc; +} + + + + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + int i = 0; + int rc = 0; + int set = 0; + int count = lmv->desc.ld_tgt_count; + + if (count == 0) + return -ENOTTY; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if (index >= count) + return -ENODEV; + + if (lmv->tgts[index] == NULL || + lmv->tgts[index]->ltd_active == 0) + return -ENODATA; + + mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp); + if (!mdc_obd) + return -EINVAL; + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + return -EFAULT; + + rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + return -EFAULT; + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lmv_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + return -EINVAL; + + tgt = lmv->tgts[qctl->qc_idx]; + if (tgt == NULL || tgt->ltd_exp == NULL) + return -EINVAL; + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + return -EINVAL; + + break; + } + } else { + return -EINVAL; + } + + if (i >= count) + return -EAGAIN; + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + + if (icc->icc_mdtindex >= count) + return -ENODEV; + + if (lmv->tgts[icc->icc_mdtindex] == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_active == 0) + return -ENODEV; + rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp, + sizeof(*icc), icc, NULL); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + if (lmv->tgts[0] == NULL) + return -ENODATA; + rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg); + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (tgt->ltd_exp == 
NULL) + return -EINVAL; + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + struct lmv_tgt_desc *tgt; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + return 0; + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_find_target(lmv, + &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + for (i = 0; i < count; i++) { + unsigned int nr, reqlen; + int rc1; + struct hsm_user_request *req; + + nr = lmv_hsm_req_count(lmv, hur, lmv->tgts[i]); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + return -ENOMEM; + + lmv_hsm_req_build(lmv, hur, lmv->tgts[i], req); + + rc1 = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, + reqlen, req, uarg); + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + return PTR_ERR(tgt1); + + tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + return PTR_ERR(tgt2); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + return -EINVAL; + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_idx != tgt2->ltd_idx) + return -EPERM; + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg); + break; + } + default: + for (i = 0; i < count; i++) { + struct obd_device *mdc_obd; + int err; + + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL) + continue; + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. Let's pass it through */ + mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp); + mdc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, + karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + return err; + } else if (err) { + if (lmv->tgts[i]->ltd_active) { + CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n", + lmv->tgts[i]->ltd_uuid.uuid, + i, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + return rc; +} + +#if 0 +static int lmv_all_chars_policy(int count, const char *name, + int len) +{ + unsigned int c = 0; + + while (len > 0) + c += name[--len]; + c = c % count; + return c; +} + +static int lmv_nid_policy(struct lmv_obd *lmv) +{ + struct obd_import *imp; + __u32 id; + + /* + * XXX: To get nid we assume that underlying obd device is mdc. 
+ */ + imp = class_exp2cliimp(lmv->tgts[0].ltd_exp); + id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32); + return id % lmv->desc.ld_tgt_count; +} + +static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + enum placement_policy placement) +{ + switch (placement) { + case PLACEMENT_CHAR_POLICY: + return lmv_all_chars_policy(lmv->desc.ld_tgt_count, + op_data->op_name, + op_data->op_namelen); + case PLACEMENT_NID_POLICY: + return lmv_nid_policy(lmv); + + default: + break; + } + + CERROR("Unsupported placement policy %x\n", placement); + return -EINVAL; +} +#endif + +/** + * This is _inode_ placement policy function (not name). + */ +static int lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data, u32 *mds) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + LASSERT(mds != NULL); + + if (lmv->desc.ld_tgt_count == 1) { + *mds = 0; + return 0; + } + + /** + * If stripe_offset is provided during setdirstripe + * (setdirstripe -i xx), xx MDS will be chosen. + */ + if (op_data->op_cli_flags & CLI_SET_MEA) { + struct lmv_user_md *lum; + + lum = (struct lmv_user_md *)op_data->op_data; + if (lum->lum_type == LMV_STRIPE_TYPE && + lum->lum_stripe_offset != -1) { + if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) { + CERROR("%s: Stripe_offset %d > MDT count %d: rc = %d\n", + obd->obd_name, + lum->lum_stripe_offset, + lmv->desc.ld_tgt_count, -ERANGE); + return -ERANGE; + } + *mds = lum->lum_stripe_offset; + return 0; + } + } + + /* Allocate new fid on target according to operation type and parent + * home mds. */ + *mds = op_data->op_mds; + return 0; +} + +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) +{ + struct lmv_tgt_desc *tgt; + int rc; + + tgt = lmv_get_target(lmv, mds); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + + if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) { + rc = -ENODEV; + goto out; + } + + /* + * Asking underlaying tgt layer to allocate new fid. 
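+ *
+ * A positive return from obd_fid_alloc() is also treated as success
+ * below, once the fid passes the sanity check; holding ltd_fid_mutex
+ * across the call keeps the sequence allocation and the FLD setup
+ * atomic with respect to each other, as noted above.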
+ */ + rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + +out: + mutex_unlock(&tgt->ltd_fid_mutex); + return rc; +} + +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + u32 mds = 0; + int rc; + + LASSERT(op_data != NULL); + LASSERT(fid != NULL); + + rc = lmv_placement_policy(obd, op_data, &mds); + if (rc) { + CERROR("Can't get target for allocating fid, rc %d\n", + rc); + return rc; + } + + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + + return rc; +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lprocfs_static_vars lvars; + struct lmv_desc *desc; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + return -EINVAL; + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + return -EINVAL; + } + + OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32); + if (lmv->tgts == NULL) + return -ENOMEM; + lmv->tgts_size = 32; + + obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); + lmv->desc.ld_tgt_count = 0; + lmv->desc.ld_active_tgt_count = 0; + lmv->max_cookiesize = 0; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + lmv->lmv_placement = PLACEMENT_CHAR_POLICY; + + spin_lock_init(&lmv->lmv_lock); + mutex_init(&lmv->init_mutex); + + lprocfs_lmv_init_vars(&lvars); + + lprocfs_obd_setup(obd, lvars.obd_vars); +#if defined (CONFIG_PROC_FS) + { + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + } +#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) { + CERROR("Can't init FLD, err %d\n", rc); + goto out; + } + + return 0; + +out: + return rc; +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + fld_client_fini(&lmv->lmv_fld); + if (lmv->tgts != NULL) { + int i; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL) + continue; + lmv_del_target(lmv, i); + } + OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); + lmv->tgts_size = 0; + } + return 0; +} + +static int lmv_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { + rc = -EINVAL; + goto out; + } + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) { + rc = -EINVAL; + goto out; + } + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) { + rc = -EINVAL; + goto out; + } + rc = lmv_add_target(obd, &obd_uuid, index, gen); + goto out; + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device 
*obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + int rc = 0; + int i; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + return -ENOMEM; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + max_age, flags); + if (rc) { + CERROR("can't stat MDS #%d (%s), error %d\n", i, + lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + rc); + goto out_free_temp; + } + + if (i == 0) { + *osfs = *temp; + /* If the statfs is from mount, it will needs + * retrieve necessary information from MDT0. + * i.e. mount does not need the merged osfs + * from all of MDT. + * And also clients can be mounted as long as + * MDT0 is in service*/ + if (flags & OBD_STATFS_FOR_MDT0) + goto out_free_temp; + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + } + } + +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_getstatus(struct obd_export *exp, + struct lu_fid *fid, + struct obd_capa **pc) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc); + return rc; +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, request); + + return rc; +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, suppgid, + request); + + return rc; +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_idx; + return 0; + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + return rc; +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With 
DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + md_null_inode(lmv->tgts[i]->ltd_exp, fid); + } + + return 0; +} + +static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data); + if (rc) + return rc; + } + + return rc; +} + + +static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + return rc; +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid) +{ + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + op_data->op_mds = tgt->ltd_idx; + + return tgt; +} + +static int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + if (!lmv->desc.ld_active_tgt_count) + return -EIO; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc) + return rc; + + CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + + if (rc == 0) { + if (*request == NULL) + return rc; + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + return rc; +} + +static int lmv_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_done_writing(tgt->ltd_exp, op_data, mod); + return rc; +} + +static int +lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct 
md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + __u64 extra_lock_flags) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle plock; + struct lmv_tgt_desc *tgt; + struct md_op_data *rdata; + struct lu_fid fid1; + struct mdt_body *body; + int rc = 0; + int pmode; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (!(body->valid & OBD_MD_MDS)) + return 0; + + CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + LASSERT(pmode != 0); + memcpy(&plock, lockh, sizeof(plock)); + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + fid1 = body->fid1; + + ptlrpc_req_finished(req); + + tgt = lmv_find_target(lmv, &fid1); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto out; + } + + OBD_ALLOC_PTR(rdata); + if (rdata == NULL) { + rc = -ENOMEM; + goto out; + } + + rdata->op_fid1 = fid1; + rdata->op_bias = MDS_CROSS_REF; + + rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh, + lmm, lmmsize, NULL, extra_lock_flags); + OBD_FREE_PTR(rdata); +out: + ldlm_lock_decref(&plock, pmode); + return rc; +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1)); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx); + + rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + + if (rc == 0 && it && it->it_op == IT_OPEN) { + rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh, + lmm, lmmsize, extra_lock_flags); + } + return rc; +} + +static int +lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + tgt->ltd_idx); + + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + if (rc != 0) + return rc; + + body = req_capsule_server_get(&(*request)->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->valid & OBD_MD_MDS) { + struct lu_fid rid = body->fid1; + CDEBUG(D_INODE, "Request attrs for "DFID"\n", + PFID(&rid)); + + tgt = lmv_find_target(lmv, &rid); + if (IS_ERR(tgt)) { + ptlrpc_req_finished(*request); + return PTR_ERR(tgt); + } + + op_data->op_fid1 = rid; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + rc = 
md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*request); + *request = req; + } + + return rc; +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data, + int op_tgt, ldlm_mode_t mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ldlm_policy_data_t policy = {{0}}; + int rc = 0; + + if (!fid_is_sane(fid)) + return 0; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (tgt->ltd_idx != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + return rc; +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* + * Cancel UPDATE lock on child (fid1). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + return rc; + + rc = md_link(tgt->ltd_exp, op_data, request); + + return rc; +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *src_tgt; + struct lmv_tgt_desc *tgt_tgt; + int rc; + + LASSERT(oldlen != 0); + + CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n", + oldlen, old, PFID(&op_data->op_fid1), + newlen, new, PFID(&op_data->op_fid2)); + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(src_tgt)) + return PTR_ERR(src_tgt); + + tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt_tgt)) + return PTR_ERR(tgt_tgt); + /* + * LOOKUP lock on src child (fid3) should also be cancelled for + * src_tgt in mdc_rename. 
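+ *
+ * For reference, the fids carried in md_op_data map as follows for a
+ * rename: op_fid1 is the source parent directory, op_fid2 the target
+ * parent directory, op_fid3 the source child and op_fid4 the existing
+ * target child (if any), which is why the flags and early
+ * cancellations below refer to FID1 through FID4.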
+ */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its + * own target. + */ + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, + MF_MDC_CANCEL_FID2); + + /* + * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt. + */ + if (rc == 0) { + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + } + + /* + * Cancel all the locks on tgt child (fid4). + */ + if (rc == 0) + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID4); + + if (rc == 0) + rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen, + new, newlen, request); + return rc; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2, + ea2len, request, mod); + + return rc; +} + +static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_sync(tgt->ltd_exp, fid, oc, request); + return rc; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each CFS_PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same CFS_PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each CFS_PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. 
+ * | next CFS_PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next CFS_PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. + */ +#if PAGE_CACHE_SIZE > LU_PAGE_SIZE +static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs) +{ + int i; + + for (i = 0; i < ncfspgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; + + while (--nlupgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the CFS_PAGE. */ + if (!((unsigned long)dp & ~CFS_PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; + + /* Check if lu_dirpage contains no entries. */ + if (!end_dirent) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(nlupgs == 0, "left = %d", nlupgs); +} +#else +#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0) +#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */ + +static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + __u64 offset = op_data->op_offset; + int rc; + int ncfspgs; /* pages read in PAGE_CACHE_SIZE */ + int nlupgs; /* pages read in LU_PAGE_SIZE */ + struct lmv_tgt_desc *tgt; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n", + offset, PFID(&op_data->op_fid1)); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_readpage(tgt->ltd_exp, op_data, pages, request); + if (rc != 0) + return rc; + + ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1) + >> PAGE_CACHE_SHIFT; + nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages); + + CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs, + op_data->op_npages); + + lmv_adjust_dirpages(pages, ncfspgs, nlupgs); + + return rc; +} + +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; +retry: + /* Send unlink requests to the MDT where the child is located */ + if 
(likely(!fid_is_zero(&op_data->op_fid2))) + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + else + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel FULL locks on child (fid3). + */ + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + + if (rc != 0) + return rc; + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc != 0 && rc != -EREMOTE) + return rc; + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->fid1)); + + /* This is a remote object, try remote MDT, Note: it may + * try more than 1 time here, Considering following case + * /mnt/lustre is root on MDT0, remote1 is on MDT1 + * 1. Initially A does not know where remote1 is, it send + * unlink RPC to MDT0, MDT0 return -EREMOTE, it will + * resend unlink RPC to MDT1 (retry 1st time). + * + * 2. During the unlink RPC in flight, + * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 + * and create new remote1, but on MDT0 + * + * 3. MDT1 get unlink RPC(from A), then do remote lock on + * /mnt/lustre, then lookup get fid of remote1, and find + * it is remote dir again, and replay -EREMOTE again. + * + * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). + * + * In theory, it might try unlimited time here, but it should + * be very rare case. */ + op_data->op_fid2 = body->fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + switch (stage) { + case OBD_CLEANUP_EARLY: + /* XXX: here should be calling obd_precleanup() down to + * stack. */ + break; + case OBD_CLEANUP_EXPORTS: + fld_client_proc_fini(&lmv->lmv_fld); + lprocfs_obd_cleanup(obd); + break; + default: + break; + } + return 0; +} + +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + struct lmv_tgt_desc *tgt; + int i; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + LASSERT(*vallen == sizeof(__u32)); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + /* + * All tgts should be connected when this gets called. 
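+ *
+ * Targets that are not (yet) connected are simply skipped; the loop
+ * returns success as soon as one target answers the "remote_flag"
+ * query and falls back to -EINVAL only if none of them does.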
+ */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val, NULL)) + return 0; + } + return -EINVAL; + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_MAX_COOKIESIZE) || + KEY_IS(KEY_DEFAULT_COOKIESIZE) || + KEY_IS(KEY_CONN_DATA)) { + rc = lmv_check_connect(obd); + if (rc) + return rc; + + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, + vallen, val, NULL); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + return rc; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->desc.ld_tgt_count; + return 0; + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + return -EINVAL; +} + +static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) { + int i, err = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + return rc; + } + + return -EINVAL; +} + +static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *meap; + struct lmv_stripe_md *lsmp; + int mea_size; + int i; + + mea_size = lmv_get_easize(lmv); + if (!lmmp) + return mea_size; + + if (*lmmp && !lsm) { + OBD_FREE_LARGE(*lmmp, mea_size); + *lmmp = NULL; + return 0; + } + + if (*lmmp == NULL) { + OBD_ALLOC_LARGE(*lmmp, mea_size); + if (*lmmp == NULL) + return -ENOMEM; + } + + if (!lsm) + return mea_size; + + lsmp = (struct lmv_stripe_md *)lsm; + meap = (struct lmv_stripe_md *)*lmmp; + + if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR && + lsmp->mea_magic != MEA_MAGIC_ALL_CHARS) + return -EINVAL; + + meap->mea_magic = cpu_to_le32(lsmp->mea_magic); + meap->mea_count = cpu_to_le32(lsmp->mea_count); + meap->mea_master = cpu_to_le32(lsmp->mea_master); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + meap->mea_ids[i] = lsmp->mea_ids[i]; + fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]); + } + + return mea_size; +} + +static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_size) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp; + struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm; + struct lmv_obd *lmv = &obd->u.lmv; + int mea_size; + int i; + __u32 magic; + + mea_size = lmv_get_easize(lmv); + if (lsmp == NULL) + return mea_size; + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE_LARGE(*tmea, mea_size); + *lsmp = NULL; + return 0; + } + + LASSERT(mea_size == lmm_size); + + OBD_ALLOC_LARGE(*tmea, mea_size); + if (*tmea == NULL) + return -ENOMEM; + + if (!lmm) + return mea_size; + + if (mea->mea_magic == MEA_MAGIC_LAST_CHAR || + mea->mea_magic == MEA_MAGIC_ALL_CHARS || + mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) { + magic = 
le32_to_cpu(mea->mea_magic);
+ } else {
+ /*
+ * Old MEA formats are not handled here.
+ */
+ CERROR("unsupported old MEA format found\n");
+ LBUG();
+ }
+
+ (*tmea)->mea_magic = magic;
+ (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+ (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+ for (i = 0; i < (*tmea)->mea_count; i++) {
+ (*tmea)->mea_ids[i] = mea->mea_ids[i];
+ fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+ }
+ return mea_size;
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags, void *opaque)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int rc = 0;
+ int err;
+ int i;
+
+ LASSERT(fid != NULL);
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+ lmv->tgts[i]->ltd_active == 0)
+ continue;
+
+ err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+ policy, mode, flags, opaque);
+ if (!rc)
+ rc = err;
+ }
+ return rc;
+}
+
+static int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+ __u64 *bits)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+ int rc;
+
+ rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+ return rc;
+}
+
+static ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ ldlm_mode_t rc;
+ int i;
+
+ CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+ /*
+ * With DNE every object can have two locks in different namespaces:
+ * a lookup lock in the space of the MDT storing the direntry and an
+ * update/open lock in the space of the MDT storing the inode. Thus
+ * we check all targets, not only the one the fid was created on.
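+ *
+ * The first non-zero mode returned by md_lock_match() wins; a return
+ * of 0 from this function therefore means no target had a matching
+ * lock.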
+ */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) + continue; + + rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid, + type, policy, mode, lockh); + if (rc) + return rc; + } + + return 0; +} + +static int lmv_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md); +} + +static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + + if (md->mea) + obd_free_memmd(exp, (void *)&md->mea); + return md_free_lustre_md(lmv->tgts[0]->ltd_exp, md); +} + +static int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + return md_set_open_replay_data(tgt->ltd_exp, och, it); +} + +static int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + return md_clear_open_replay_data(tgt->ltd_exp, och); +} + +static int lmv_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request); + return rc; +} + +static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_renew_capa(tgt->ltd_exp, oc, cb); + return rc; +} + +static int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc); +} + +static int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo); + return rc; +} + +static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc 
*tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + return rc; +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + int rc = 0, i; + __u64 curspace, curinodes; + + if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + return -EIO; + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + return rc; + } + + curspace = curinodes = 0; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + if (!tgt->ltd_active) { + CDEBUG(D_HA, "mdt %d is inactive.\n", i); + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", i, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + return rc; +} + +static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i, rc = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + CERROR("lmv idx %d inactive\n", i); + return -EIO; + } + + err = obd_quotacheck(tgt->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + + return rc; +} + +static struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_packmd = lmv_packmd, + .o_unpackmd = lmv_unpackmd, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_iocontrol = lmv_iocontrol, + .o_quotacheck = lmv_quotacheck, + .o_quotactl = lmv_quotactl +}; + +static struct md_ops lmv_md_ops = { + .m_getstatus = lmv_getstatus, + .m_null_inode = lmv_null_inode, + .m_find_cbdata = lmv_find_cbdata, + .m_close = lmv_close, + .m_create = lmv_create, + .m_done_writing = lmv_done_writing, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_sync = lmv_sync, + .m_readpage = lmv_readpage, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_set_open_replay_data = lmv_set_open_replay_data, + 
.m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_renew_capa = lmv_renew_capa, + .m_unpack_capa = lmv_unpack_capa, + .m_get_remote_perm = lmv_get_remote_perm, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock +}; + +static int __init lmv_init(void) +{ + struct lprocfs_static_vars lvars; + int rc; + + lprocfs_lmv_init_vars(&lvars); + + rc = class_register_type(&lmv_obd_ops, &lmv_md_ops, + lvars.module_vars, LUSTRE_LMV_NAME, NULL); + return rc; +} + +static void lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver"); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c new file mode 100644 index 000000000..22e5c315f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" +#include "lmv_internal.h" + +static int lmv_numobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_numobd); + +static const char *placement_name[] = { + [PLACEMENT_CHAR_POLICY] = "CHAR", + [PLACEMENT_NID_POLICY] = "NID", + [PLACEMENT_INVAL_POLICY] = "INVAL" +}; + +static enum placement_policy placement_name2policy(char *name, int len) +{ + int i; + + for (i = 0; i < PLACEMENT_MAX_POLICY; i++) { + if (!strncmp(placement_name[i], name, len)) + return i; + } + return PLACEMENT_INVAL_POLICY; +} + +static const char *placement_policy2name(enum placement_policy placement) +{ + LASSERT(placement < PLACEMENT_MAX_POLICY); + return placement_name[placement]; +} + +static int lmv_placement_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", placement_policy2name(lmv->lmv_placement)); + return 0; +} + +#define MAX_POLICY_STRING_SIZE 64 + +static ssize_t lmv_placement_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + char dummy[MAX_POLICY_STRING_SIZE + 1]; + int len = count; + enum placement_policy policy; + struct lmv_obd *lmv; + + if (copy_from_user(dummy, buffer, MAX_POLICY_STRING_SIZE)) + return -EFAULT; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + + if (len > MAX_POLICY_STRING_SIZE) + len = MAX_POLICY_STRING_SIZE; + + if (dummy[len - 1] == '\n') + len--; + dummy[len] = '\0'; + + policy = placement_name2policy(dummy, len); + if (policy != PLACEMENT_INVAL_POLICY) { + spin_lock(&lmv->lmv_lock); + lmv->lmv_placement = policy; + spin_unlock(&lmv->lmv_lock); + } else { + CERROR("Invalid placement policy \"%s\"!\n", dummy); + return -EINVAL; + } + return count; +} +LPROC_SEQ_FOPS(lmv_placement); + +static int lmv_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_activeobd); + +static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_desc_uuid); + +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos]; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ + return; +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + ++*pos; + return (*pos >= lmv->desc.ld_tgt_count) ? 
NULL : lmv->tgts[*pos]; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (tgt == NULL) + return 0; + seq_printf(p, "%d: %s %sACTIVE\n", + tgt->ltd_idx, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); + +static struct lprocfs_vars lprocfs_lmv_obd_vars[] = { + { "numobd", &lmv_numobd_fops, NULL, 0 }, + { "placement", &lmv_placement_fops, NULL, 0 }, + { "activeobd", &lmv_activeobd_fops, NULL, 0 }, + { "uuid", &lmv_uuid_fops, NULL, 0 }, + { "desc_uuid", &lmv_desc_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(lmv, numrefs); + +static struct lprocfs_vars lprocfs_lmv_module_vars[] = { + { "num_refs", &lmv_numrefs_fops, NULL, 0 }, + { NULL } +}; + +struct file_operations lmv_proc_target_fops = { + .owner = THIS_MODULE, + .open = lmv_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lmv_module_vars; + lvars->obd_vars = lprocfs_lmv_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/Makefile b/kernel/drivers/staging/lustre/lustre/lov/Makefile new file mode 100644 index 000000000..6fe56a24b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTRE_FS) += lov.o +lov-y := lov_obd.o lov_pack.o lov_offset.o lov_merge.o \ + lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \ + lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \ + lovsub_lock.o lovsub_io.o lov_pool.o +lov-$(CONFIG_PROC_FS) += lproc_lov.o diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h new file mode 100644 index 000000000..314ce8525 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -0,0 +1,839 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/cl_object.h" +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - sub-lock keep a reference to its top-locks. Top-lock keeps a + * reference (and a hold, see cl_lock_hold()) on its sub-locks when it + * actively using them (that is, in cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When + * moving into cl_lock_state::CLS_CACHED state, top-lock releases a + * hold. From this moment top-lock has only a 'weak' reference to its + * sub-locks. This reference is protected by top-lock + * cl_lock::cll_guard, and will be automatically cleared by the sub-lock + * when the latter is destroyed. When a sub-lock is canceled, a + * reference to it is removed from the top-lock array, and top-lock is + * moved into CLS_NEW state. It is guaranteed that all sub-locks exist + * while their top-lock is in CLS_HELD or CLS_CACHED states. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; +struct lovsub_lock; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. + */ + +/** + * Resources that are used in memory-cleaning path, and whose allocation + * cannot fail even when memory is tight. They are preallocated in sufficient + * quantities in lov_device::ld_emerg[], and access to them is serialized + * lov_device::ld_mutex. + */ +struct lov_device_emerg { + /** + * Page list used to submit IO when memory is in pressure. + */ + struct cl_page_list emrg_page_list; + /** + * sub-io's shared by all threads accessing this device when memory is + * too low to allocate sub-io's dynamically. 
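+ *
+ * Allocating a fresh cl_io while the kernel is trying to free memory
+ * could itself require memory, so the memory-cleaning path reuses
+ * this preallocated sub-io (serialized by lov_device::ld_mutex)
+ * instead.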
+ */ + struct cl_io emrg_subio; + /** + * Environments used by sub-io's in + * lov_device_emerg::emrg_subio. + */ + struct lu_env *emrg_env; + /** + * Refchecks for lov_device_emerg::emrg_env. + * + * \see cl_env_get() + */ + int emrg_refcheck; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /** Emergency resources used in memory-cleansing paths. */ + struct lov_device_emerg **ld_emrg; + /** + * Serializes access to lov_device::ld_emrg in low-memory + * conditions. + */ + struct mutex ld_mutex; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RAID0, /** striped file */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RAID0: + return "RAID0"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * True if layout is invalid. This bit is cleared when layout lock + * is lost. + */ + bool lo_layout_invalid; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + int lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. 
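+ *
+ * For example, a file striped over three stripes has lo_nr == 3
+ * and three lovsub_object pointers in this array.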
+ * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; + } raid0; + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + struct task_struct *lo_owner; +}; + +/** + * Flags that top-lock can set on each of its sub-locks. + */ +enum lov_sub_flags { + /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */ + LSF_HELD = 1 << 0 +}; + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct lovsub_lock *sub_lock; + /** An array of per-sub-lock flags, taken from enum lov_sub_flags */ + unsigned sub_flags; + int sub_stripe; + struct cl_lock_descr sub_descr; + struct cl_lock_descr sub_got; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** + * Number of existing sub-locks. + */ + unsigned lls_nr_filled; + /** + * Set when sub-lock was canceled, while top-lock was being + * used, or unused. + */ + unsigned int lls_cancel_race:1; + /** + * An array of sub-locks + * + * There are two issues with managing sub-locks: + * + * - sub-locks are concurrently canceled, and + * + * - sub-locks are shared with other top-locks. + * + * To manage cancellation, top-lock acquires a hold on a sublock + * (lov_sublock_adopt()) when the latter is inserted into + * lov_lock::lls_sub[]. This hold is released (lov_sublock_release()) + * when top-lock is going into CLS_CACHED state or destroyed. Hold + * prevents sub-lock from cancellation. + * + * Sub-lock sharing means, among other things, that top-lock that is + * in the process of creation (i.e., not yet inserted into lock list) + * is already accessible to other threads once at least one of its + * sub-locks is created, see lov_lock_sub_init(). + * + * Sub-lock can be in one of the following states: + * + * - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such + * sub-lock was either never created (top-lock is in CLS_NEW + * state), or it was created, then canceled, then destroyed + * (lov_lock_unlink() cleared sub-lock pointer in the top-lock). + * + * - sub-lock exists and is on + * hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a + * normal state of a sub-lock in CLS_HELD and CLS_CACHED states + * of a top-lock. + * + * - sub-lock exists, but is not held by the top-lock. This + * happens after top-lock released a hold on sub-locks before + * going into cache (lov_lock_unuse()). + * + * \todo To support wide-striping, array has to be replaced with a set + * of queues to avoid scanning. + */ + struct lov_lock_sub *lls_sub; + /** + * Original description with which lock was enqueued. + */ + struct cl_lock_descr lls_orig; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + int lps_invalid; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct lov_device *acid_super; + int acid_idx; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * A link between a top-lock and a sub-lock. 
Separate data-structure is + * necessary, because top-locks and sub-locks are in M:N relationship. + * + * \todo This can be optimized for a (by far) most frequent case of a single + * top-lock per sub-lock. + */ +struct lov_lock_link { + struct lov_lock *lll_super; + /** An index within parent lock. */ + int lll_idx; + /** + * A linkage into per sub-lock list of all corresponding top-locks, + * hanging off lovsub_lock::lss_parents. + */ + struct list_head lll_list; +}; + +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; + /** + * List of top-locks that have given sub-lock as their part. Protected + * by cl_lock::cll_guard mutex. + */ + struct list_head lss_parents; + /** + * Top-lock that initiated current operation on this sub-lock. This is + * only set during top-to-bottom lock operations like enqueue, and is + * used to optimize state change notification. Protected by + * cl_lock::cll_guard mutex. + * + * \see lovsub_lock_state_one(). + */ + struct cl_lock *lss_active; +}; + +/** + * Describe the environment settings for sublocks. + */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; + struct lov_io_sub *lse_sub; +}; + +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct cl_lock_descr lti_ldescr; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_lock_closure lti_closure; + wait_queue_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + int sub_stripe; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io *sub_io; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + /** + * true, iff cl_io_init() was successfully executed against + * lov_io_sub::sub_io. + */ + int sub_io_initialized; + /** + * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't + * allocated, but borrowed from a per-device emergency pool. + */ + int sub_borrowed; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + int sub_refcheck; + int sub_refcheck2; + int sub_reenter; + void *sub_cookie; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + u64 lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). 
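+ *
+ * For example (illustrative): a 4096-byte write starting at file
+ * offset 0 has lis_pos == 0 and lis_endpos == 4096, while
+ * lis_io_endpos still holds the end position of the whole IO as
+ * issued by the upper layer, before it was clipped to one stripe.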
+ */ + u64 lis_endpos; + + int lis_mem_frozen; + int lis_stripe_count; + int lis_active_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct cl_io lis_single_subio; + + /** + * size of ls_subios array, actually the highest stripe # + */ + int lis_nr_subios; + struct lov_io_sub *lis_subs; + /** + * List of active sub-io's. + */ + struct list_head lis_active; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +/** + * State of transfer for lov. + */ +struct lov_req { + struct cl_req_slice lr_cl; +}; + +/** + * State of transfer for lovsub. + */ +struct lovsub_req { + struct cl_req_slice lsrq_cl; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; +extern struct kmem_cache *lov_req_kmem; + +extern struct kmem_cache *lovsub_lock_kmem; +extern struct kmem_cache *lovsub_object_kmem; +extern struct kmem_cache *lovsub_req_kmem; + +extern struct kmem_cache *lov_lock_link_kmem; + +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +void lov_lock_unlink(const struct lu_env *env, struct lov_lock_link *link, + struct lovsub_lock *sub); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); +void lov_sub_put(struct lov_io_sub *sub); +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx); + + +int lov_page_init(const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); +int lovsub_page_init(const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); + +int lov_page_init_empty(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +int lov_page_init_raid0(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_lock_link *lov_lock_link_find(const struct 
lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub); +struct lov_io_sub *lov_page_subio(const struct lu_env *env, + struct lov_io *lio, + const struct cl_page_slice *slice); + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm); +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. + * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + 
LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lov_req, lr_cl); +} + +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + +static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lovsub_req, lsrq_cl); +} + +static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice) +{ + return slice->cpl_page->cp_child; +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return info; +} + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) +{ + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC || + lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3); + return &lov->u.raid0; +} + +/** @} lov */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c b/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c new file mode 100644 index 000000000..711b837dd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c @@ -0,0 +1,528 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. 
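+ *
+ * The lov device fans out to one lovsub device per target;
+ * lov_device_init() below builds that stack by calling, per target
+ * (shown here only as an illustration):
+ *
+ *    cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+ *                       desc->ltd_obd->obd_lu_dev);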
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include "../include/obd_class.h" + +#include "lov_cl_internal.h" +#include "lov_internal.h" + + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; +struct kmem_cache *lov_req_kmem; + +struct kmem_cache *lovsub_lock_kmem; +struct kmem_cache *lovsub_object_kmem; +struct kmem_cache *lovsub_req_kmem; + +struct kmem_cache *lov_lock_link_kmem; + +/** Lock class of lov_device::ld_mutex. */ +static struct lock_class_key cl_lov_device_mutex_class; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lov_req_kmem, + .ckd_name = "lov_req_kmem", + .ckd_size = sizeof(struct lov_req) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof(struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = &lovsub_req_kmem, + .ckd_name = "lovsub_req_kmem", + .ckd_size = sizeof(struct lovsub_req) + }, + { + .ckd_cache = &lov_lock_link_kmem, + .ckd_name = "lov_lock_link_kmem", + .ckd_size = sizeof(struct lov_lock_link) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov transfer operations. + * + */ + +static void lov_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lov_req *lr; + + lr = cl2lov_req(slice); + OBD_SLAB_FREE_PTR(lr, lov_req_kmem); +} + +static const struct cl_req_operations lov_req_ops = { + .cro_completion = lov_req_completion +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
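+ *
+ * lov_key and lov_session_key below follow the lu_context_key pattern:
+ * per-thread and per-session state is allocated in the *_init() hooks
+ * and later retrieved through the accessors in lov_cl_internal.h,
+ * e.g. (illustrative):
+ *
+ *    struct lov_thread_info *info = lov_env_info(env);
+ *    struct lov_io *lio = lov_env_io(env);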
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (info != NULL) + INIT_LIST_HEAD(&info->lti_closure.clc_list); + else + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + LINVRNT(list_empty(&info->lti_closure.clc_list)); + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + int i; + struct lov_device *ld = lu2lov_dev(d); + + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + return NULL; + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + return NULL; +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + return rc; + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc == NULL) + continue; + + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = i; + lsd->acid_super = ld; + ld->ld_target[i] = lsd; + } + + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; + + return rc; +} + +static int lov_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lov_req *lr; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, GFP_NOFS); + if (lr != NULL) { + cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct cl_device_operations lov_cl_ops = { + .cdo_req_init = lov_req_init +}; + +static void lov_emerg_free(struct lov_device_emerg **emrg, int nr) +{ + int i; + + for (i = 0; i < nr; ++i) { + struct lov_device_emerg *em; + + em = emrg[i]; + if (em != NULL) { + LASSERT(em->emrg_page_list.pl_nr == 0); + if (em->emrg_env != NULL) + cl_env_put(em->emrg_env, &em->emrg_refcheck); + OBD_FREE_PTR(em); + } + } + OBD_FREE(emrg, nr * sizeof(emrg[0])); +} + +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = 
lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target != NULL) + OBD_FREE(ld->ld_target, nr * sizeof(ld->ld_target[0])); + if (ld->ld_emrg != NULL) + lov_emerg_free(ld->ld_emrg, nr); + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } +} + +static struct lov_device_emerg **lov_emerg_alloc(int nr) +{ + struct lov_device_emerg **emerg; + int i; + int result; + + OBD_ALLOC(emerg, nr * sizeof(emerg[0])); + if (emerg == NULL) + return ERR_PTR(-ENOMEM); + for (result = i = 0; i < nr && result == 0; i++) { + struct lov_device_emerg *em; + + OBD_ALLOC_PTR(em); + if (em != NULL) { + emerg[i] = em; + cl_page_list_init(&em->emrg_page_list); + em->emrg_env = cl_env_alloc(&em->emrg_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (!IS_ERR(em->emrg_env)) + em->emrg_env->le_ctx.lc_cookie = 0x2; + else { + result = PTR_ERR(em->emrg_env); + em->emrg_env = NULL; + } + } else + result = -ENOMEM; + } + if (result != 0) { + lov_emerg_free(emerg, nr); + emerg = ERR_PTR(result); + } + return emerg; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + struct lov_device_emerg **emerg; + const size_t sz = sizeof(newd[0]); + + emerg = lov_emerg_alloc(tgt_size); + if (IS_ERR(emerg)) + return PTR_ERR(emerg); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd != NULL) { + mutex_lock(&dev->ld_mutex); + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + + if (dev->ld_emrg != NULL) + lov_emerg_free(dev->ld_emrg, sub_size); + dev->ld_emrg = emerg; + mutex_unlock(&dev->ld_mutex); + } else { + lov_emerg_free(emerg, tgt_size); + result = -ENOMEM; + } + } + return result; +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + obd_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + return -EINVAL; + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); + + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = index; + lsd->acid_super = ld; + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + obd_putref(obd); + return rc; +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + obd_getref(obd); + + cmd = cfg->lcfg_command; + rc = 
lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc == 0) { + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + obd_putref(obd); + return rc; +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (ld == NULL) + return ERR_PTR(-ENOMEM); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + ld->ld_cl.cd_ops = &lov_cl_ops; + + mutex_init(&ld->ld_mutex); + lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class); + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + return ERR_PTR(rc); + } + + ld->ld_lov = &obd->u.lov; + return d; +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; +EXPORT_SYMBOL(lov_device_type); + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c b/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c new file mode 100644 index 000000000..2bcfaeaff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" + +#include "lov_internal.h" + +struct lovea_unpack_args { + struct lov_stripe_md *lsm; + int cursor; +}; + +static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, + __u16 stripe_count) +{ + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CERROR("bad stripe count %d\n", stripe_count); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + CERROR("zero object id\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + CERROR("bad striping pattern\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + CERROR("bad stripe size %u\n", + le32_to_cpu(lmm->lmm_stripe_size)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + return 0; +} + +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size) +{ + struct lov_stripe_md *lsm; + struct lov_oinfo *loi; + int i, oinfo_ptrs_size; + + LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT); + + oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count; + *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size; + + OBD_ALLOC_LARGE(lsm, *size); + if (!lsm) + return NULL; + + for (i = 0; i < stripe_count; i++) { + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + goto err; + lsm->lsm_oinfo[i] = loi; + } + lsm->lsm_stripe_count = stripe_count; + return lsm; + +err: + while (--i >= 0) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi)); + OBD_FREE_LARGE(lsm, *size); + return NULL; +} + +void lsm_free_plain(struct lov_stripe_md *lsm) +{ + __u16 stripe_count = lsm->lsm_stripe_count; + int i; + + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, + sizeof(struct lov_oinfo)); + OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) + + stripe_count * sizeof(struct lov_oinfo *)); +} + +static void lsm_unpackmd_common(struct lov_stripe_md *lsm, + struct lov_mds_md *lmm) +{ + /* + * This supposes lov_mds_md_v1/v3 first fields are + * are the same + */ + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_pool_name[0] = '\0'; +} + +static void +lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno, + u64 *lov_off, u64 *swidth) +{ + if (swidth) + *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static void +lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno, + u64 *lov_off, u64 *swidth) +{ + if (swidth) + *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa, + struct obd_export *md_exp) +{ + return 0; +} + +/* Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. 
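+ *
+ * The callers (lsm_unpackmd_v1/v3 below) then derive the per-file
+ * limit from the smallest per-target value, roughly (illustrative):
+ *
+ *    lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;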
*/ +static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes) +{ + struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import; + + if (imp == NULL || !tgt->ltd_active) { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + return; + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) { + if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes) + *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes; + } else { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + } + spin_unlock(&imp->imp_lock); +} + +static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, + __u16 *stripe_count) +{ + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v1 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + *stripe_count = 0; + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) { + CERROR("LOV EA V1 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count); +} + +static int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md_v1 *lmm) +{ + struct lov_oinfo *loi; + int i; + int stripe_count; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + + lsm_unpackmd_common(lsm, lmm); + + stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count; + + for (i = 0; i < stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + if (lsm->lsm_stripe_count == 0) + lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count; + + return 0; +} + +const struct lsm_operations lsm_v1_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v1, + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes, + __u16 *stripe_count) +{ + struct lov_mds_md_v3 *lmm; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v3 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + *stripe_count = 0; + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) { + CERROR("LOV EA V3 too small: %d, need %d\n", + lmm_bytes, 
lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes, + *stripe_count); +} + +static int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmv1) +{ + struct lov_mds_md_v3 *lmm; + struct lov_oinfo *loi; + int i; + int stripe_count; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + int cplen = 0; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm); + + stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count; + + cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name, + sizeof(lsm->lsm_pool_name)); + if (cplen >= sizeof(lsm->lsm_pool_name)) + return -E2BIG; + + for (i = 0; i < stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + if (lsm->lsm_stripe_count == 0) + lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count; + + return 0; +} + +const struct lsm_operations lsm_v3_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v3, + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) +{ + CDEBUG(level, "lsm %p, objid " DOSTID ", maxbytes %#llx, magic 0x%08X, stripe_size %u, stripe_count %u, refc: %d, layout_gen %u, pool [" LOV_POOLNAMEF "]\n", + lsm, + POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, + lsm->lsm_pool_name); +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h b/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h new file mode 100644 index 000000000..b644acc9b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_user.h" + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. */ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(n, base); \ + } \ + __rem; \ +}) +#endif + +struct lov_lock_handles { + struct portals_handle llh_handle; + atomic_t llh_refcount; + int llh_stripe_count; + struct lustre_handle llh_handles[0]; +}; + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + + struct list_head rq_link; + + int rq_idx; /* index in lov->tgts array */ + int rq_stripe; /* stripe number */ + int rq_complete; + int rq_rc; + int rq_buflen; /* length of sub_md */ + + u32 rq_oabufs; + u32 rq_pgaidx; +}; + +struct lov_request_set { + struct ldlm_enqueue_info *set_ei; + struct obd_info *set_oi; + atomic_t set_refcount; + struct obd_export *set_exp; + /* XXX: There is @set_exp already, however obd_statfs gets obd_device + only. 
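+ *
+ * Lifetime (illustrative): a set is reference counted via set_refcount;
+ * users pair lov_get_reqset()/lov_put_reqset(), and the final put
+ * frees the set through lov_finish_set():
+ *
+ *    lov_get_reqset(set);
+ *    ...
+ *    lov_put_reqset(set);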
*/ + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + atomic_t set_finish_checked; + struct llog_cookie *set_cookies; + int set_cookie_sent; + struct obd_trans_info *set_oti; + u32 set_oabufs; + struct brw_page *set_pga; + struct lov_lock_handles *set_lockh; + struct list_head set_list; + wait_queue_head_t set_waitq; + spinlock_t set_lock; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +void lov_finish_set(struct lov_request_set *set); + +static inline void lov_get_reqset(struct lov_request_set *set) +{ + LASSERT(set != NULL); + LASSERT(atomic_read(&set->set_refcount) > 0); + atomic_inc(&set->set_refcount); +} + +static inline void lov_put_reqset(struct lov_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + lov_finish_set(set); +} + +static inline struct lov_lock_handles * +lov_handle2llh(struct lustre_handle *handle) +{ + LASSERT(handle != NULL); + return class_handle2object(handle->cookie); +} + +static inline void lov_llh_put(struct lov_lock_handles *llh) +{ + CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount) - 1); + LASSERT(atomic_read(&llh->llh_refcount) > 0 && + atomic_read(&llh->llh_refcount) < 0x5a5a); + if (atomic_dec_and_test(&llh->llh_refcount)) { + class_handle_unhash(&llh->llh_handle); + /* The structure may be held by other threads because RCU. + * -jxiong */ + if (atomic_read(&llh->llh_refcount)) + return; + + OBD_FREE_RCU(llh, sizeof(*llh) + + sizeof(*llh->llh_handles) * llh->llh_stripe_count, + &llh->llh_handle); + } +} + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, + struct lov_stripe_md *lsm, int stripeno, int *set); +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink); +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, + int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off, + int stripeno, u64 *u64); +u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + u64 start, u64 end, + u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off); + +/* lov_qos.c */ +#define LOV_USES_ASSIGNED_STRIPE 0 +#define LOV_USES_DEFAULT_STRIPE 1 +int qos_add_tgt(struct obd_device *obd, __u32 index); +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); +void qos_shrink_lsm(struct lov_request_set *set); +int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); +void qos_update(struct lov_obd *lov); +void qos_statfs_done(struct lov_obd *lov); +void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait); +int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); + +/* lov_request.c */ +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set); +int lov_set_finished(struct lov_request_set *set, int idempotent); +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx); +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info 
*oinfo, + struct lov_request_set **reqset); +int lov_fini_getattr_set(struct lov_request_set *set); +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_fini_destroy_set(struct lov_request_set *set); +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_fini_setattr_set(struct lov_request_set *set); +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success); +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success); +int lov_fini_statfs_set(struct lov_request_set *set); +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc); + +/* lov_obd.c */ +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm, + struct lov_stripe_md *lsm); +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes); +int lov_getstripe(struct obd_export *exp, + struct lov_stripe_md *lsm, struct lov_user_md *lump); +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); +void lov_dump_lmm(int level, void *lmm); + +/* lov_ea.c */ +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size); +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +#if defined (CONFIG_PROC_FS) +extern const struct file_operations lov_proc_target_fops; +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +/* pools */ +extern cfs_hash_ops_t pool_hash_operations; +/* ost_pool methods */ +int lov_ost_pool_init(struct ost_pool *op, unsigned int count); +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); +int lov_ost_pool_free(struct ost_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char 
*poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); +void lov_pool_putref(struct pool_desc *pool); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_io.c b/kernel/drivers/staging/lustre/lustre/lov/lov_io.c new file mode 100644 index 000000000..cf96e0d01 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_io.c @@ -0,0 +1,990 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. 
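+ *
+ * A top-level cl_io is split into one sub-io per stripe touched by
+ * the request, each running against the corresponding lovsub object
+ * in its own environment.  The typical pattern used by the iteration
+ * code below is (illustrative):
+ *
+ *    sub = lov_sub_get(env, lio, stripe);
+ *    if (!IS_ERR(sub)) {
+ *            rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+ *            lov_sub_put(sub);
+ *    }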
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline void lov_sub_enter(struct lov_io_sub *sub) +{ + sub->sub_reenter++; +} +static inline void lov_sub_exit(struct lov_io_sub *sub) +{ + sub->sub_reenter--; +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + if (sub->sub_io != NULL) { + if (sub->sub_io_initialized) { + lov_sub_enter(sub); + cl_io_fini(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + sub->sub_io_initialized = 0; + lio->lis_active_subios--; + } + if (sub->sub_stripe == lio->lis_single_subio_index) + lio->lis_single_subio_index = -1; + else if (!sub->sub_borrowed) + OBD_FREE_PTR(sub->sub_io); + sub->sub_io = NULL; + } + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (!sub->sub_borrowed) + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } +} + +static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, + int stripe, loff_t start, loff_t end) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + + switch (io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa; + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, new_size, stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + default: + break; + } +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev); + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + + int stripe = sub->sub_stripe; + int result; + + LASSERT(sub->sub_io == NULL); + LASSERT(sub->sub_env == NULL); + LASSERT(sub->sub_stripe < lio->lis_stripe_count); + + if (unlikely(lov_r0(lov)->lo_sub[stripe] == NULL)) + return -EIO; + + result = 0; + sub->sub_io_initialized = 0; + sub->sub_borrowed = 0; + + if (lio->lis_mem_frozen) { + LASSERT(mutex_is_locked(&ld->ld_mutex)); + sub->sub_io = &ld->ld_emrg[stripe]->emrg_subio; + sub->sub_env = ld->ld_emrg[stripe]->emrg_env; + sub->sub_borrowed = 1; + } else { + void *cookie; + + /* obtain new environment */ + cookie = cl_env_reenter(); + sub->sub_env = cl_env_get(&sub->sub_refcheck); + cl_env_reexit(cookie); + if (IS_ERR(sub->sub_env)) + result = PTR_ERR(sub->sub_env); + + if (result == 0) { + /* + * First sub-io. 
Use ->lis_single_subio to + * avoid dynamic allocation. + */ + if (lio->lis_active_subios == 0) { + sub->sub_io = &lio->lis_single_subio; + lio->lis_single_subio_index = stripe; + } else { + OBD_ALLOC_PTR(sub->sub_io); + if (sub->sub_io == NULL) + result = -ENOMEM; + } + } + } + + if (result == 0) { + sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]); + sub_io = sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + + lov_sub_enter(sub); + result = cl_io_sub_init(sub->sub_env, sub_io, + io->ci_type, sub_obj); + lov_sub_exit(sub); + if (result >= 0) { + lio->lis_active_subios++; + sub->sub_io_initialized = 1; + result = 0; + } + } + if (result != 0) + lov_io_sub_fini(env, lio, sub); + return result; +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int stripe) +{ + int rc; + struct lov_io_sub *sub = &lio->lis_subs[stripe]; + + LASSERT(stripe < lio->lis_stripe_count); + + if (!sub->sub_io_initialized) { + sub->sub_stripe = stripe; + rc = lov_io_sub_init(env, lio, sub); + } else + rc = 0; + if (rc == 0) + lov_sub_enter(sub); + else + sub = ERR_PTR(rc); + return sub; +} + +void lov_sub_put(struct lov_io_sub *sub) +{ + lov_sub_exit(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +static int lov_page_stripe(const struct cl_page *page) +{ + struct lovsub_object *subobj; + + subobj = lu2lovsub( + lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header, + &lovsub_device_type)); + LASSERT(subobj != NULL); + return subobj->lso_index; +} + +struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, + const struct cl_page_slice *slice) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_page *page = slice->cpl_page; + int stripe; + + LASSERT(lio->lis_cl.cis_io != NULL); + LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object); + LASSERT(lsm != NULL); + LASSERT(lio->lis_nr_subios > 0); + + stripe = lov_page_stripe(page); + return lov_sub_get(env, lio, stripe); +} + + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + int result; + + LASSERT(lio->lis_object != NULL); + + /* + * Need to be optimized, we can't afford to allocate a piece of memory + * when writing a page. 
-jay + */ + OBD_ALLOC_LARGE(lio->lis_subs, + lsm->lsm_stripe_count * sizeof(lio->lis_subs[0])); + if (lio->lis_subs != NULL) { + lio->lis_nr_subios = lio->lis_stripe_count; + lio->lis_single_subio_index = -1; + lio->lis_active_subios = 0; + result = 0; + } else + result = -ENOMEM; + return result; +} + +static void lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + io->ci_result = 0; + lio->lis_object = obj; + + LASSERT(obj->lo_lsm != NULL); + lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + int i; + + if (lio->lis_subs != NULL) { + for (i = 0; i < lio->lis_nr_subios; i++) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + OBD_FREE_LARGE(lio->lis_subs, + lio->lis_nr_subios * sizeof(lio->lis_subs[0])); + lio->lis_nr_subios = 0; + } + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); +} + +static u64 lov_offset_mod(u64 val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + u64 endpos; + u64 start; + u64 end; + int stripe; + int rc = 0; + + endpos = lov_offset_mod(lio->lis_endpos, -1); + for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) { + if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos, + endpos, &start, &end)) + continue; + + if (unlikely(lov_r0(lio->lis_object)->lo_sub[stripe] == NULL)) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + return -EIO; + + continue; + } + + end = lov_offset_mod(end, 1); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + lov_io_sub_inherit(sub->sub_io, lio, stripe, + start, end); + rc = cl_io_iter_init(sub->sub_env, sub->sub_io); + lov_sub_put(sub); + CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n", + stripe, start, end); + } else + rc = PTR_ERR(sub); + + if (!rc) + list_add_tail(&sub->sub_linkage, &lio->lis_active); + else + break; + } + return rc; +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + __u64 start = io->u.ci_rw.crw_pos; + loff_t 
next; + unsigned long ssize = lsm->lsm_stripe_size; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + /* fast path for common case. */ + if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) { + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = ~0ull; + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos, + next) - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos); + } + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + return lov_io_iter_init(env, ios); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + lov_sub_enter(sub); + rc = iofunc(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io->ci_result; + } + return rc; +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + return lov_io_call(env, cl2lov_io(env, ios), cl_io_lock); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + return lov_io_call(env, cl2lov_io(env, ios), cl_io_start); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + return 0; +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + return 0; +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + return 0; +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); +} + + +static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld, + struct cl_page_list *qin, + int idx, int alloc) +{ + return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list; +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. 
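+ *
+ * In outline (a simplified sketch of the flow implemented below, leaving the
+ * emergency path aside):
+ *
+ *	cl_page_list_for_each_safe(page, tmp, qin)
+ *		cl_page_list_move(QIN(lov_page_stripe(page)), qin, page);
+ *	for each stripe with a non-empty QIN(stripe):
+ *		cl_page_list_splice(QIN(stripe), &cl2q->c2_qin);
+ *		cl_io_submit_rw(sub->sub_env, sub->sub_io, crt, cl2q);
+ *		splice cl2q->c2_qin and cl2q->c2_qout back into \a queue;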
+ * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. + */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *obj = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev); + struct cl_page_list *qin = &queue->c2_qin; + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + struct cl_page_list *stripes_qin = NULL; + struct cl_page *page; + struct cl_page *tmp; + int stripe; + +#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc) + + int rc = 0; + int alloc = + !(current->flags & PF_MEMALLOC); + + if (lio->lis_active_subios == 1) { + int idx = lio->lis_single_subio_index; + struct lov_io_sub *sub; + + LASSERT(idx < lio->lis_nr_subios); + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub->sub_io == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, queue); + lov_sub_put(sub); + return rc; + } + + LASSERT(lio->lis_subs != NULL); + if (alloc) { + OBD_ALLOC_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + if (stripes_qin == NULL) + return -ENOMEM; + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) + cl_page_list_init(&stripes_qin[stripe]); + } else { + /* + * If we get here, it means pageout & swap doesn't help. + * In order to not make things worse, even don't try to + * allocate the memory with __GFP_NOWARN. 
-jay + */ + mutex_lock(&ld->ld_mutex); + lio->lis_mem_frozen = 1; + } + + cl_2queue_init(cl2q); + cl_page_list_for_each_safe(page, tmp, qin) { + stripe = lov_page_stripe(page); + cl_page_list_move(QIN(stripe), qin, page); + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct lov_io_sub *sub; + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, &cl2q->c2_qin); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, cl2q); + lov_sub_put(sub); + } else + rc = PTR_ERR(sub); + cl_page_list_splice(&cl2q->c2_qin, &queue->c2_qin); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + if (rc != 0) + break; + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, qin); + } + + if (alloc) { + OBD_FREE_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + } else { + int i; + + for (i = 0; i < lio->lis_nr_subios; i++) { + struct cl_io *cio = lio->lis_subs[i].sub_io; + + if (cio && cio == &ld->ld_emrg[i]->emrg_subio) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + } + lio->lis_mem_frozen = 0; + mutex_unlock(&ld->ld_mutex); + } + + return rc; +#undef QIN +} + +static int lov_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_prepare_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + return result; +} + +static int lov_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_commit_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + return result; +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page)); + sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob; + lov_sub_put(sub); + return lov_io_start(env, ios); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = sub->sub_io; + + lov_sub_enter(sub); + lov_io_end_wrapper(sub->sub_env, subio); + lov_sub_exit(sub); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + 
.cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = lov_io_submit + }, + [CRT_WRITE] = { + .cio_submit = lov_io_submit + } + }, + .cio_prepare_write = lov_io_prepare_write, + .cio_commit_write = lov_io_commit_write +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
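+ *
+ * It is installed by lov_io_init_empty() and lov_io_init_released() below
+ * only when the io type can complete trivially at the lov layer; the
+ * LOV_EMPTY_IMPOSSIBLE entries are stubs that LBUG() if they are ever
+ * reached.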
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + }, + [CRT_WRITE] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + } + }, + .cio_commit_write = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + + INIT_LIST_HEAD(&lio->lis_active); + lov_io_slice_init(lio, lov, io); + if (io->ci_result == 0) { + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + } + return io->ci_result; +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_SETATTR: + result = 1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + return result != 0; +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + case CIT_MISC: + case CIT_FSYNC: + result = 1; + break; + case CIT_SETATTR: + /* the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* the truncate is for size > 0 so triggers a restore */ + if (cl_io_is_trunc(io)) + io->ci_restore_needed = 1; + result = -ENODATA; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + return result != 0; +} +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c b/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c new file mode 100644 index 000000000..f2eca565b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c @@ -0,0 +1,1198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent); + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice); +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO information(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. 
+ */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + subenv->lse_sub = NULL; + } else { + sub = lov_sub_get(env, lio, lls->sub_stripe); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = sub->sub_io; + subenv->lse_sub = sub; + } else { + subenv = (void *)sub; + } + } + return subenv; +} + +static void lov_sublock_env_put(struct lov_sublock_env *subenv) +{ + if (subenv && subenv->lse_sub) + lov_sub_put(subenv->lse_sub); +} + +static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, int idx, + struct lov_lock_link *link) +{ + struct lovsub_lock *lsl; + struct cl_lock *parent = lck->lls_cl.cls_lock; + int rc; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sublock)); + + lsl = cl2sub_lock(sublock); + /* + * check that sub-lock doesn't have lock link to this top-lock. + */ + LASSERT(lov_lock_link_find(env, lck, lsl) == NULL); + LASSERT(idx < lck->lls_nr); + + lck->lls_sub[idx].sub_lock = lsl; + lck->lls_nr_filled++; + LASSERT(lck->lls_nr_filled <= lck->lls_nr); + list_add_tail(&link->lll_list, &lsl->lss_parents); + link->lll_idx = idx; + link->lll_super = lck; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lov-child", sublock); + lck->lls_sub[idx].sub_flags |= LSF_HELD; + cl_lock_user_add(env, sublock); + + rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx); + LASSERT(rc == 0); /* there is no way this can fail, currently */ +} + +static struct cl_lock *lov_sublock_alloc(const struct lu_env *env, + const struct cl_io *io, + struct lov_lock *lck, + int idx, struct lov_lock_link **out) +{ + struct cl_lock *sublock; + struct cl_lock *parent; + struct lov_lock_link *link; + + LASSERT(idx < lck->lls_nr); + + OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, GFP_NOFS); + if (link != NULL) { + struct lov_sublock_env *subenv; + struct lov_lock_sub *lls; + struct cl_lock_descr *descr; + + parent = lck->lls_cl.cls_lock; + lls = &lck->lls_sub[idx]; + descr = &lls->sub_got; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + /* CAVEAT: Don't try to add a field in lov_lock_sub + * to remember the subio. This is because lock is able + * to be cached, but this is not true for IO. This + * further means a sublock might be referenced in + * different io context. -jay */ + + sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io, + descr, "lov-parent", parent); + lov_sublock_env_put(subenv); + } else { + /* error occurs. 
*/ + sublock = (void *)subenv; + } + + if (!IS_ERR(sublock)) + *out = link; + else + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + } else + sublock = ERR_PTR(-ENOMEM); + return sublock; +} + +static void lov_sublock_unlock(const struct lu_env *env, + struct lovsub_lock *lsl, + struct cl_lock_closure *closure, + struct lov_sublock_env *subenv) +{ + lov_sublock_env_put(subenv); + lsl->lss_active = NULL; + cl_lock_disclosure(env, closure); +} + +static int lov_sublock_lock(const struct lu_env *env, + struct lov_lock *lck, + struct lov_lock_sub *lls, + struct cl_lock_closure *closure, + struct lov_sublock_env **lsep) +{ + struct lovsub_lock *sublock; + struct cl_lock *child; + int result = 0; + + LASSERT(list_empty(&closure->clc_list)); + + sublock = lls->sub_lock; + child = sublock->lss_cl.cls_lock; + result = cl_lock_closure_build(env, child, closure); + if (result == 0) { + struct cl_lock *parent = closure->clc_origin; + + LASSERT(cl_lock_is_mutexed(child)); + sublock->lss_active = parent; + + if (unlikely((child->cll_state == CLS_FREEING) || + (child->cll_flags & CLF_CANCELLED))) { + struct lov_lock_link *link; + /* + * we could race with lock deletion which temporarily + * put the lock in freeing state, bug 19080. + */ + LASSERT(!(lls->sub_flags & LSF_HELD)); + + link = lov_lock_link_find(env, lck, sublock); + LASSERT(link != NULL); + lov_lock_unlink(env, link, sublock); + lov_sublock_unlock(env, sublock, closure, NULL); + lck->lls_cancel_race = 1; + result = CLO_REPEAT; + } else if (lsep) { + struct lov_sublock_env *subenv; + subenv = lov_sublock_env_get(env, parent, lls); + if (IS_ERR(subenv)) { + lov_sublock_unlock(env, sublock, + closure, NULL); + result = PTR_ERR(subenv); + } else { + *lsep = subenv; + } + } + } + return result; +} + +/** + * Updates the result of a top-lock operation from a result of sub-lock + * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate + * over sub-locks and lov_subresult() is used to calculate return value of a + * top-operation. To this end, possible return values of sub-operations are + * ordered as + * + * - 0 success + * - CLO_WAIT wait for event + * - CLO_REPEAT repeat top-operation + * - -ne fundamental error + * + * Top-level return code can only go down through this list. CLO_REPEAT + * overwrites CLO_WAIT, because lock mutex was released and sleeping condition + * has to be rechecked by the upper layer. + */ +static int lov_subresult(int result, int rc) +{ + int result_rank; + int rc_rank; + + LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT, + "result = %d", result); + LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT, + "rc = %d\n", rc); + CLASSERT(CLO_WAIT < CLO_REPEAT); + + /* calculate ranks in the ordering above */ + result_rank = result < 0 ? 1 + CLO_REPEAT : result; + rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc; + + if (result_rank < rc_rank) + result = rc; + return result; +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). 
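+ *
+ * As a concrete illustration (assuming a plain RAID0 layout with a 1 MiB
+ * stripe size and three stripes): a top-lock on file bytes [0, 4 MiB)
+ * intersects stripes 0, 1 and 2, so three entries of lls_sub[] are filled,
+ * each with the extent converted into the corresponding sub-object's own
+ * index space; a top-lock on [0, 512 KiB) intersects stripe 0 only and gets
+ * a single sub-lock description.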
+ */ +static int lov_lock_sub_init(const struct lu_env *env, + struct lov_lock *lck, const struct cl_io *io) +{ + int result = 0; + int i; + int nr; + u64 start; + u64 end; + u64 file_start; + u64 file_end; + + struct lov_object *loo = cl2lov(lck->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct cl_lock *parent = lck->lls_cl.cls_lock; + + lck->lls_orig = parent->cll_descr; + file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start); + file_end = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1; + + for (i = 0, nr = 0; i < r0->lo_nr; i++) { + /* + * XXX for wide striping smarter algorithm is desirable, + * breaking out of the loop, early. + */ + if (likely(r0->lo_sub[i] != NULL) && + lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) + nr++; + } + LASSERT(nr > 0); + OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof(lck->lls_sub[0])); + if (lck->lls_sub == NULL) + return -ENOMEM; + + lck->lls_nr = nr; + /* + * First, fill in sub-lock descriptions in + * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc() + * (called below in this function, and by lov_lock_enqueue()) to + * create sub-locks. At this moment, no other thread can access + * top-lock. + */ + for (i = 0, nr = 0; i < r0->lo_nr; ++i) { + if (likely(r0->lo_sub[i] != NULL) && + lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) { + struct cl_lock_descr *descr; + + descr = &lck->lls_sub[nr].sub_descr; + + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = parent->cll_descr.cld_mode; + descr->cld_gid = parent->cll_descr.cld_gid; + descr->cld_enq_flags = parent->cll_descr.cld_enq_flags; + /* XXX has no effect */ + lck->lls_sub[nr].sub_got = *descr; + lck->lls_sub[nr].sub_stripe = i; + nr++; + } + } + LASSERT(nr == lck->lls_nr); + + /* + * Some sub-locks can be missing at this point. This is not a problem, + * because enqueue will create them anyway. Main duty of this function + * is to fill in sub-lock descriptions in a race free manner. + */ + return result; +} + +static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck, + int i, int deluser, int rc) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + + if (lck->lls_sub[i].sub_flags & LSF_HELD) { + struct cl_lock *sublock; + int dying; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + + lck->lls_sub[i].sub_flags &= ~LSF_HELD; + if (deluser) + cl_lock_user_del(env, sublock); + /* + * If the last hold is released, and cancellation is pending + * for a sub-lock, release parent mutex, to avoid keeping it + * while sub-lock is being paged out. + */ + dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM || + sublock->cll_descr.cld_mode == CLM_GROUP || + (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) && + sublock->cll_holds == 1; + if (dying) + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + if (dying) { + cl_lock_mutex_get(env, parent); + rc = lov_subresult(rc, CLO_REPEAT); + } + /* + * From now on lck->lls_sub[i].sub_lock is a "weak" pointer, + * not backed by a reference on a + * sub-lock. lovsub_lock_delete() will clear + * lck->lls_sub[i].sub_lock under semaphores, just before + * sub-lock is destroyed. 
+ */ + } + return rc; +} + +static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck, + int i) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + + if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) { + struct cl_lock *sublock; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + LASSERT(sublock->cll_state != CLS_FREEING); + + lck->lls_sub[i].sub_flags |= LSF_HELD; + + cl_lock_get_trust(sublock); + cl_lock_hold_add(env, sublock, "lov-parent", parent); + cl_lock_user_add(env, sublock); + cl_lock_put(env, sublock); + } +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck; + int i; + + lck = cl2lov_lock(slice); + LASSERT(lck->lls_nr_filled == 0); + if (lck->lls_sub != NULL) { + for (i = 0; i < lck->lls_nr; ++i) + /* + * No sub-locks exists at this point, as sub-lock has + * a reference on its parent. + */ + LASSERT(lck->lls_sub[i].sub_lock == NULL); + OBD_FREE_LARGE(lck->lls_sub, + lck->lls_nr * sizeof(lck->lls_sub[0])); + } + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_lock_enqueue_wait(const struct lu_env *env, + struct lov_lock *lck, + struct cl_lock *sublock) +{ + struct cl_lock *lock = lck->lls_cl.cls_lock; + int result; + + LASSERT(cl_lock_is_mutexed(lock)); + + cl_lock_mutex_put(env, lock); + result = cl_lock_enqueue_wait(env, sublock, 0); + cl_lock_mutex_get(env, lock); + return result ?: CLO_REPEAT; +} + +/** + * Tries to advance a state machine of a given sub-lock toward enqueuing of + * the top-lock. + * + * \retval 0 if state-transition can proceed + * \retval -ve otherwise. + */ +static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, + struct cl_io *io, __u32 enqflags, int last) +{ + int result; + + /* first, try to enqueue a sub-lock ... */ + result = cl_enqueue_try(env, sublock, io, enqflags); + if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) { + /* if it is enqueued, try to `wait' on it---maybe it's already + * granted */ + result = cl_wait_try(env, sublock); + if (result == CLO_REENQUEUED) + result = CLO_WAIT; + } + /* + * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in + * parallel, otherwise---enqueue has to wait until sub-lock is granted + * before proceeding to the next one. + */ + if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) && + (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL))) + result = 0; + return result; +} + +/** + * Helper function for lov_lock_enqueue() that creates missing sub-lock. 
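+ *
+ * Roughly: the parent mutex is dropped (it is held with cll_depth == 1), a
+ * sub-lock is allocated and held via lov_sublock_alloc(), both mutexes are
+ * re-acquired, and the new sub-lock is adopted into lck->lls_sub[idx] only
+ * if the top-lock is still in CLS_QUEUING and the slot is still empty;
+ * otherwise it is released again. On success CLO_REPEAT is returned so the
+ * caller restarts its iteration, since the mutex was dropped in the
+ * meantime.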
+ */ +static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent, + struct cl_io *io, struct lov_lock *lck, int idx) +{ + struct lov_lock_link *link = NULL; + struct cl_lock *sublock; + int result; + + LASSERT(parent->cll_depth == 1); + cl_lock_mutex_put(env, parent); + sublock = lov_sublock_alloc(env, io, lck, idx, &link); + if (!IS_ERR(sublock)) + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + + if (!IS_ERR(sublock)) { + cl_lock_get_trust(sublock); + if (parent->cll_state == CLS_QUEUING && + lck->lls_sub[idx].sub_lock == NULL) { + lov_sublock_adopt(env, lck, sublock, idx, link); + } else { + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + /* other thread allocated sub-lock, or enqueue is no + * longer going on */ + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + cl_lock_mutex_get(env, parent); + } + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + result = CLO_REPEAT; + } else + result = PTR_ERR(sublock); + return result; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. + */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, lock); + int i; + int result; + enum cl_lock_state minstate; + + for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct lov_lock_sub *lls; + struct cl_lock *sublock; + struct lov_sublock_env *subenv; + + if (lock->cll_state != CLS_QUEUING) { + /* + * Lock might have left QUEUING state if previous + * iteration released its mutex. Stop enqueing in this + * case and let the upper layer to decide what to do. + */ + LASSERT(i > 0 && result != 0); + break; + } + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + if (sub == NULL) { + result = lov_sublock_fill(env, lock, io, lck, i); + /* lov_sublock_fill() released @lock mutex, + * restart. */ + break; + } + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + lov_sublock_hold(env, lck, i); + rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock, + subenv->lse_io, enqflags, + i == lck->lls_nr - 1); + minstate = min(minstate, sublock->cll_state); + if (rc == CLO_WAIT) { + switch (sublock->cll_state) { + case CLS_QUEUING: + /* take recursive mutex, the lock is + * released in lov_lock_enqueue_wait. + */ + cl_lock_mutex_get(env, sublock); + lov_sublock_unlock(env, sub, closure, + subenv); + rc = lov_lock_enqueue_wait(env, lck, + sublock); + break; + case CLS_CACHED: + cl_lock_get(sublock); + /* take recursive mutex of sublock */ + cl_lock_mutex_get(env, sublock); + /* need to release all locks in closure + * otherwise it may deadlock. LU-2683.*/ + lov_sublock_unlock(env, sub, closure, + subenv); + /* sublock and parent are held. 
*/ + rc = lov_sublock_release(env, lck, i, + 1, rc); + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + break; + default: + lov_sublock_unlock(env, sub, closure, + subenv); + break; + } + } else { + LASSERT(sublock->cll_conflict == NULL); + lov_sublock_unlock(env, sub, closure, subenv); + } + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + cl_lock_closure_fini(closure); + return result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT; +} + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. */ + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (lls->sub_flags & LSF_HELD) { + LASSERT(sublock->cll_state == CLS_HELD || + sublock->cll_state == CLS_ENQUEUED); + rc = cl_unuse_try(subenv->lse_env, sublock); + rc = lov_sublock_release(env, lck, i, 0, rc); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + } + + if (result == 0 && lck->lls_cancel_race) { + lck->lls_cancel_race = 0; + result = -ESTALE; + } + cl_lock_closure_fini(closure); + return result; +} + + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. 
*/ + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (!(lls->sub_flags & LSF_HELD)) { + lov_sublock_unlock(env, sub, closure, subenv); + continue; + } + + switch (sublock->cll_state) { + case CLS_HELD: + rc = cl_unuse_try(subenv->lse_env, sublock); + lov_sublock_release(env, lck, i, 0, 0); + break; + default: + lov_sublock_release(env, lck, i, 1, 0); + break; + } + lov_sublock_unlock(env, sub, closure, subenv); + } + + if (rc == CLO_REPEAT) { + --i; + continue; + } + + result = lov_subresult(result, rc); + } + + if (result) + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %d.\n", result); + + cl_lock_closure_fini(closure); +} + +static int lov_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + enum cl_lock_state minstate; + int reenqueued; + int result; + int i; + +again: + for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0; + i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + LASSERT(sub != NULL); + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state >= CLS_ENQUEUED); + if (sublock->cll_state < CLS_HELD) + rc = cl_wait_try(env, sublock); + + minstate = min(minstate, sublock->cll_state); + lov_sublock_unlock(env, sub, closure, subenv); + } + if (rc == CLO_REENQUEUED) { + reenqueued++; + rc = 0; + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + /* Each sublock only can be reenqueued once, so will not loop for + * ever. */ + if (result == 0 && reenqueued != 0) + goto again; + cl_lock_closure_fini(closure); + return result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT; +} + +static int lov_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int result; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) { + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + result = -ESTALE; + break; + } + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state != CLS_FREEING); + lov_sublock_hold(env, lck, i); + if (sublock->cll_state == CLS_CACHED) { + rc = cl_use_try(subenv->lse_env, sublock, 0); + if (rc != 0) + rc = lov_sublock_release(env, lck, + i, 1, rc); + } else if (sublock->cll_state == CLS_NEW) { + /* Sub-lock might have been canceled, while + * top-lock was cached. 
*/ + result = -ESTALE; + lov_sublock_release(env, lck, i, 1, result); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + + if (lck->lls_cancel_race) { + /* + * If there is unlocking happened at the same time, then + * sublock_lock state should be FREEING, and lov_sublock_lock + * should return CLO_REPEAT. In this case, it should return + * ESTALE, and up layer should reset the lock state to be NEW. + */ + lck->lls_cancel_race = 0; + LASSERT(result != 0); + result = -ESTALE; + } + cl_lock_closure_fini(closure); + return result; +} + +#if 0 +static int lock_lock_multi_match() +{ + struct cl_lock *lock = slice->cls_lock; + struct cl_lock_descr *subneed = &lov_env_info(env)->lti_ldescr; + struct lov_object *loo = cl2lov(lov->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_lock_sub *sub; + struct cl_object *subobj; + u64 fstart; + u64 fend; + u64 start; + u64 end; + int i; + + fstart = cl_offset(need->cld_obj, need->cld_start); + fend = cl_offset(need->cld_obj, need->cld_end + 1) - 1; + subneed->cld_mode = need->cld_mode; + cl_lock_mutex_get(env, lock); + for (i = 0; i < lov->lls_nr; ++i) { + sub = &lov->lls_sub[i]; + if (sub->sub_lock == NULL) + continue; + subobj = sub->sub_descr.cld_obj; + if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe, + fstart, fend, &start, &end)) + continue; + subneed->cld_start = cl_index(subobj, start); + subneed->cld_end = cl_index(subobj, end); + subneed->cld_obj = subobj; + if (!cl_lock_ext_match(&sub->sub_got, subneed)) { + result = 0; + break; + } + } + cl_lock_mutex_put(env, lock); +} +#endif + +/** + * Check if the extent region \a descr is covered by \a child against the + * specific \a stripe. + */ +static int lov_lock_stripe_is_matching(const struct lu_env *env, + struct lov_object *lov, int stripe, + const struct cl_lock_descr *child, + const struct cl_lock_descr *descr) +{ + struct lov_stripe_md *lsm = lov->lo_lsm; + u64 start; + u64 end; + int result; + + if (lov_r0(lov)->lo_nr == 1) + return cl_lock_ext_match(child, descr); + + /* + * For a multi-stripes object: + * - make sure the descr only covers child's stripe, and + * - check if extent is matching. + */ + start = cl_offset(&lov->lo_cl, descr->cld_start); + end = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1; + result = 0; + /* glimpse should work on the object with LOV EA hole. */ + if (end - start <= lsm->lsm_stripe_size) { + int idx; + + idx = lov_stripe_number(lsm, start); + if (idx == stripe || + unlikely(lov_r0(lov)->lo_sub[idx] == NULL)) { + idx = lov_stripe_number(lsm, end); + if (idx == stripe || + unlikely(lov_r0(lov)->lo_sub[idx] == NULL)) + result = 1; + } + } + + if (result != 0) { + struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr; + u64 sub_start; + u64 sub_end; + + subd->cld_obj = NULL; /* don't need sub object at all */ + subd->cld_mode = descr->cld_mode; + subd->cld_gid = descr->cld_gid; + result = lov_stripe_intersects(lsm, stripe, start, end, + &sub_start, &sub_end); + LASSERT(result); + subd->cld_start = cl_index(child->cld_obj, sub_start); + subd->cld_end = cl_index(child->cld_obj, sub_end); + result = cl_lock_ext_match(child, subd); + } + return result; +} + +/** + * An implementation of cl_lock_operations::clo_fits_into() method. + * + * Checks whether a lock (given by \a slice) is suitable for \a + * io. Multi-stripe locks can be used only for "quick" io, like truncate, or + * O_APPEND write. + * + * \see ccc_lock_fits_into(). 
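+ *
+ * For example (assuming a plain RAID0 layout, two stripes of 1 MiB): a
+ * cached lock whose single sub-lock covers stripe 0 may match a read of
+ * file bytes [0, 1 MiB), which touches stripe 0 only, but not a read of
+ * [0, 2 MiB), which also needs stripe 1; a CLM_GROUP lock, or a
+ * multi-stripe lock used by truncate or O_APPEND write, is instead matched
+ * against the original top-lock extent (lls_orig).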
+ */ +static int lov_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct lov_lock *lov = cl2lov_lock(slice); + struct lov_object *obj = cl2lov(slice->cls_obj); + int result; + + LASSERT(cl_object_same(need->cld_obj, slice->cls_obj)); + LASSERT(lov->lls_nr > 0); + + /* for top lock, it's necessary to match enq flags otherwise it will + * run into problem if a sublock is missing and reenqueue. */ + if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags) + return 0; + + if (need->cld_mode == CLM_GROUP) + /* + * always allow to match group lock. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + else if (lov->lls_nr == 1) { + struct cl_lock_descr *got = &lov->lls_sub[0].sub_got; + result = lov_lock_stripe_is_matching(env, + cl2lov(slice->cls_obj), + lov->lls_sub[0].sub_stripe, + got, need); + } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC && + !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM) + /* + * Multi-stripe locks are only suitable for `quick' IO and for + * glimpse. + */ + result = 0; + else + /* + * Most general case: multi-stripe existing lock, and + * (potentially) multi-stripe @need lock. Check that @need is + * covered by @lov's sub-locks. + * + * For now, ignore lock expansions made by the server, and + * match against original lock extent. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n", + PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got), + lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr, + result); + return result; +} + +void lov_lock_unlink(const struct lu_env *env, + struct lov_lock_link *link, struct lovsub_lock *sub) +{ + struct lov_lock *lck = link->lll_super; + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + + list_del_init(&link->lll_list); + LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub); + /* yank this sub-lock from parent's array */ + lck->lls_sub[link->lll_idx].sub_lock = NULL; + LASSERT(lck->lls_nr_filled > 0); + lck->lls_nr_filled--; + lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock); + cl_lock_put(env, parent); + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); +} + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub) +{ + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + if (scan->lll_super == lck) + return scan; + } + return NULL; +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked for "top-to-bottom" delete, when lock destruction starts from the + * top-lock, e.g., as a result of inode destruction. + * + * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there: + * this is done separately elsewhere: + * + * - for inode destruction, lov_object_delete() calls cl_object_kill() for + * each sub-object, purging its locks; + * + * - in other cases (e.g., a fatal error with a top-lock) sub-locks are + * left in the cache. 
+ */ +static void lov_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + struct lov_lock_link *link; + int rc; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_FREEING); + + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lck->lls_sub[i]; + struct lovsub_lock *lsl = lls->sub_lock; + + if (lsl == NULL) /* already removed */ + continue; + + rc = lov_sublock_lock(env, lck, lls, closure, NULL); + if (rc == CLO_REPEAT) { + --i; + continue; + } + + LASSERT(rc == 0); + LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING); + + if (lls->sub_flags & LSF_HELD) + lov_sublock_release(env, lck, i, 1, 0); + + link = lov_lock_link_find(env, lck, lsl); + LASSERT(link != NULL); + lov_lock_unlink(env, link, lsl); + LASSERT(lck->lls_sub[i].sub_lock == NULL); + + lov_sublock_unlock(env, lsl, closure, NULL); + } + + cl_lock_closure_fini(closure); +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_flags); + if (sub->sub_lock != NULL) + cl_lock_print(env, cookie, p, + sub->sub_lock->lss_cl.cls_lock); + else + (*p)(env, cookie, "---\n"); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_wait = lov_lock_wait, + .clo_use = lov_lock_use, + .clo_unuse = lov_lock_unuse, + .clo_cancel = lov_lock_cancel, + .clo_fits_into = lov_lock_fits_into, + .clo_delete = lov_lock_delete, + .clo_print = lov_lock_print +}; + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + result = lov_lock_sub_init(env, lck, io); + } else + result = -ENOMEM; + return result; +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + lck->lls_orig = lock->cll_descr; + result = 0; + } + return result; +} + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent) +{ + struct cl_lock_closure *closure; + + closure = &lov_env_info(env)->lti_closure; + LASSERT(list_empty(&closure->clc_list)); + cl_lock_closure_init(env, closure, parent, 1); + return closure; +} + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c b/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c new file mode 100644 index 000000000..b7e7bfabe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c @@ -0,0 +1,186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
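+ *
+ * As a worked example (assuming a plain RAID0 layout, two stripes of 1 MiB):
+ * a stripe-local KMS of 3 MiB on stripe 1 covers the file ranges [1, 2),
+ * [3, 4) and [5, 6) MiB, so it contributes a file-level KMS of 6 MiB. The
+ * merged KMS and size are the maxima of such per-stripe values, blocks are
+ * summed across stripes, and mtime/atime/ctime each keep the newest value
+ * seen.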
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place) +{ + __u64 size = 0; + __u64 kms = 0; + __u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu a=%llu c=%llu b=%llu\n", + POSTID(&lsm->lsm_oi), lvb->lvb_size, lvb->lvb_mtime, + lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + u64 lov_size, tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu a=%llu c=%llu b=%llu\n", + POSTID(&lsm->lsm_oi), loi->loi_ost_idx, + loi->loi_lvb.lvb_size, loi->loi_lvb.lvb_mtime, + loi->loi_lvb.lvb_atime, loi->loi_lvb.lvb_ctime, + loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + return rc; +} + +/* Must be called under the lov_stripe_lock() */ +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink) +{ + struct lov_oinfo *loi; + int stripe = 0; + __u64 kms; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + if (shrink) { + for (; stripe < lsm->lsm_stripe_count; stripe++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[stripe]; + kms = lov_size_to_stripe(lsm, size, stripe); + CDEBUG(D_INODE, + "stripe %d KMS %sing %llu->%llu\n", + stripe, kms > loi->loi_kms ? "increase":"shrink", + loi->loi_kms, kms); + loi_kms_set(loi, loi->loi_lvb.lvb_size = kms); + } + return 0; + } + + if (size > 0) + stripe = lov_stripe_number(lsm, size - 1); + kms = lov_size_to_stripe(lsm, size, stripe); + loi = lsm->lsm_oinfo[stripe]; + + CDEBUG(D_INODE, "stripe %d KMS %sincreasing %llu->%llu\n", + stripe, kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms); + if (kms > loi->loi_kms) + loi_kms_set(loi, kms); + + return 0; +} + +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, + struct lov_stripe_md *lsm, int stripeno, int *set) +{ + valid &= src->o_valid; + + if (*set) { + if (valid & OBD_MD_FLSIZE) { + /* this handles sparse files properly */ + u64 lov_size; + + lov_size = lov_stripe_size(lsm, src->o_size, stripeno); + if (lov_size > tgt->o_size) + tgt->o_size = lov_size; + } + if (valid & OBD_MD_FLBLOCKS) + tgt->o_blocks += src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + tgt->o_blksize += src->o_blksize; + if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime) + tgt->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime) + tgt->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLDATAVERSION) + tgt->o_data_version += src->o_data_version; + } else { + memcpy(tgt, src, sizeof(*tgt)); + tgt->o_oi = lsm->lsm_oi; + if (valid & OBD_MD_FLSIZE) + tgt->o_size = lov_stripe_size(lsm, src->o_size, + stripeno); + } + + /* data_version needs to be valid on all stripes to be correct! */ + if (!(valid & OBD_MD_FLDATAVERSION)) + tgt->o_valid &= ~OBD_MD_FLDATAVERSION; + + *set += 1; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c b/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c new file mode 100644 index 000000000..027815766 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c @@ -0,0 +1,2395 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_mds.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" +#include "../include/cl_object.h" +#include "../include/lclient.h" /* for cl_client_lru */ +#include "../include/lustre/ll_fiemap.h" +#include "../include/lustre_fid.h" + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +static void lov_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +static void lov_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + LIST_HEAD(kill); + int i; + struct lov_tgt_desc *tgt, *n; + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + +#define MAX_STRING_SIZE 128 +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + struct proc_dir_entry *lov_proc_dir; + int rc; + + if (!lov->lov_tgts[index]) + return -EINVAL; + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + return -EINVAL; + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + return rc; + } + + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "not connecting OSC %s; administratively disabled\n", + obd_uuid2str(tgt_uuid)); + return 0; + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + return -ENODEV; + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + lov_proc_dir = obd->obd_proc_private; + if (lov_proc_dir) { + struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; + struct proc_dir_entry *osc_symlink; + + LASSERT(osc_obd != NULL); + LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(osc_obd->obd_type->typ_name != NULL); + + osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, + lov_proc_dir, + "../../../%s/%s", + osc_obd->obd_type->typ_name, + osc_obd->obd_name); + if (osc_symlink == NULL) { + CERROR("could not register LOV target /proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); + lprocfs_remove(&lov_proc_dir); + obd->obd_proc_private = NULL; + } + } + + return 0; +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + return rc; + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT, (void *)&i); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + obd_putref(obd); + + return 0; +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct proc_dir_entry *lov_proc_dir; + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd ? osc_obd->obd_name : "NULL"); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + lov_proc_dir = obd->obd_proc_private; + if (lov_proc_dir) { + lprocfs_remove_proc_entry(osc_obd->obd_name, lov_proc_dir); + } + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. 
Ah well. + */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + return 0; +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc; + + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CERROR("disconnect #%d\n", lov->lov_connects); + goto out; + } + + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { + /* Disconnection is the last we know about an obd */ + lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); + } + } + obd_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + return rc; +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index, activate, active; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + obd_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (!tgt) + continue; + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL && + obd_uuid_equals(uuid, &tgt->ltd_uuid)) { + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + } + if (!tgt->ltd_exp) + continue; + + CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n", + index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_exp->exp_handle.h_cookie); + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) { + index = -EINVAL; + goto out; + } + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + goto out; + } else { + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? 
"" : "in"); + } + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid); + } + + out: + obd_putref(obd); + return index; +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) { + up_read(&lov->lov_notify_lock); + return rc; + } + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + up_read(&lov->lov_notify_lock); + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + return -EINVAL; + } + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + up_read(&lov->lov_notify_lock); + CERROR("event(%d) of %s failed: %d\n", ev, + obd_uuid2str(uuid), rc); + return rc; + } + /* active event should be pass lov target index as data */ + data = &rc; + } + + /* Pass the notification up the chain. */ + if (watched) { + rc = obd_notify_observer(obd, watched, ev, data); + } else { + /* NULL watched means all osc's in the lov (only for syncs) */ + /* sync event should be send lov idx as data */ + struct lov_obd *lov = &obd->u.lov; + int i, is_sync; + + data = &i; + is_sync = (ev == OBD_NOTIFY_SYNC) || + (ev == OBD_NOTIFY_SYNC_NONBLOCK); + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* don't send sync event if target not + * connected/activated */ + if (is_sync && !lov->lov_tgts[i]->ltd_active) + continue; + + rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd, + ev, data); + if (rc) { + CERROR("%s: notify %s of %s failed %d\n", + obd->obd_name, + obd->obd_observer->obd_name, + lov->lov_tgts[i]->ltd_obd->obd_name, + rc); + } + } + obd_putref(obd); + } + + up_read(&lov->lov_notify_lock); + return rc; +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("request to add OBD %s with invalid generation: %d\n", + uuidp->uuid, gen); + return -EINVAL; + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, + &obd->obd_uuid); + if (tgt_obd == NULL) + return -EINVAL; + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->ltd_uuid), index); + mutex_unlock(&lov->lov_lock); + return -EEXIST; + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max_t(__u32, lov->lov_tgt_size, 2); + while (newsize < index + 1) + newsize <<= 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + return -ENOMEM; + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + return -ENOMEM; + } + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + return rc; + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_obd on this target when that fn first runs, + because we don't know the connect flags yet. */ + return 0; + } + + obd_getref(obd); + + rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); + if (rc) + goto out; + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) { + rc = 0; + goto out; + } + + if (lov->lov_cache != NULL) { + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), KEY_CACHE_SET, + sizeof(struct cl_client_cache), lov->lov_cache, + NULL); + if (rc < 0) + goto out; + } + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE, + (void *)&index); + +out: + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_del_target(obd, index, NULL, 0); + } + obd_putref(obd); + return rc; +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + return -EINVAL; + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + obd_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + rc = -EINVAL; + goto out; + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from obd_putref */ +out: + obd_putref(obd); + up_write(&lov->lov_notify_lock); + + return rc; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + return -EINVAL; + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + return -EINVAL; + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + return -EINVAL; + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, + HASH_POOLS_MAX_BITS, + HASH_POOLS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &pool_hash_operations, + CFS_HASH_DEFAULT); + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + goto out; + + lprocfs_lov_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); +#if defined (CONFIG_PROC_FS) + { + int rc1; + + rc1 = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lov_proc_target_fops, obd); + if (rc1) + CWARN("Error adding the target_obd file\n"); + } +#endif + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + + return 0; + +out: + return rc; +} + +static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + struct lov_obd *lov = &obd->u.lov; + + switch (stage) { + case OBD_CLEANUP_EARLY: { + int i; + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) + continue; + obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp), + OBD_CLEANUP_EARLY); + } + break; + } + default: + break; + } + + return 0; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, 
*tmp; + struct pool_desc *pool; + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + cfs_hash_putref(lov->lov_pools_hash_body); + lov_ost_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if (lov->lov_tgts[i]->ltd_active || + atomic_read(&lov->lov_refcount)) + /* We should never get here - these + should have been removed in the + disconnect. */ + CERROR("lov tgt %d not cleaned! deathrow=%d, lovrc=%d\n", + i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + obd_putref(obd); + OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * + lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + return 0; +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + + switch (cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { + rc = -EINVAL; + goto out; + } + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1) { + rc = -EINVAL; + goto out; + } + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) { + rc = -EINVAL; + goto out; + } + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + goto out; + } + case LCFG_PARAM: { + struct lprocfs_static_vars lvars = { NULL }; + struct lov_desc *desc = &(obd->u.lov.desc); + + if (!desc) { + rc = -EINVAL; + goto out; + } + + lprocfs_lov_init_vars(&lvars); + + rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + goto out; + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + goto out; + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + + } + } +out: + return rc; +} + +static int lov_recreate(struct obd_export *exp, struct obdo *src_oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct lov_stripe_md *obj_mdp, *lsm; + struct lov_obd *lov = &exp->exp_obd->u.lov; + unsigned ost_idx; + int rc, i; + + LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS && + src_oa->o_flags & OBD_FL_RECREATE_OBJS); + + OBD_ALLOC(obj_mdp, sizeof(*obj_mdp)); + if (obj_mdp == NULL) + return -ENOMEM; + + ost_idx = src_oa->o_nlink; + lsm = *ea; + if (lsm == NULL) { + rc = -EINVAL; + goto out; + } + if (ost_idx >= lov->desc.ld_tgt_count || + !lov->lov_tgts[ost_idx]) { + rc = -EINVAL; + goto out; + } + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx == ost_idx) { + if (ostid_id(&loi->loi_oi) != ostid_id(&src_oa->o_oi)) { + rc = -EINVAL; + goto out; + } + 
break; + } + } + if (i == lsm->lsm_stripe_count) { + rc = -EINVAL; + goto out; + } + + rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp, + src_oa, &obj_mdp, oti); +out: + OBD_FREE(obj_mdp, sizeof(*obj_mdp)); + return rc; +} + +/* the LOV expects oa->o_id to be set to the LOV object id */ +static int lov_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *src_oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + struct lov_obd *lov; + int rc = 0; + + LASSERT(ea != NULL); + if (exp == NULL) + return -EINVAL; + + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + src_oa->o_flags == OBD_FL_DELORPHAN) { + /* should be used with LOV anymore */ + LBUG(); + } + + lov = &exp->exp_obd->u.lov; + if (!lov->desc.ld_active_tgt_count) + return -EIO; + + obd_getref(exp->exp_obd); + /* Recreate a specific object id at the given OST index */ + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) { + rc = lov_recreate(exp, src_oa, ea, oti); + } + + obd_putref(exp->exp_obd); + return rc; +} + +#define ASSERT_LSM_MAGIC(lsmp) \ +do { \ + LASSERT((lsmp) != NULL); \ + LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 || \ + (lsmp)->lsm_magic == LOV_MAGIC_V3), \ + "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic); \ +} while (0) + +static int lov_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa) +{ + struct lov_request_set *set; + struct obd_info oinfo; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0, err = 0; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + lov = &exp->exp_obd->u.lov; + obd_getref(exp->exp_obd); + rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set); + if (rc) + goto out; + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + req->rq_oi.oi_oa, NULL, oti, NULL, capa); + err = lov_update_common_set(set, req, err); + if (err) { + CERROR("%s: destroying objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, POSTID(&oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, err); + if (!rc) + rc = err; + } + } + + if (rc == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp); + } + err = lov_fini_destroy_set(set); +out: + obd_putref(exp->exp_obd); + return rc ? rc : err; +} + +static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + /* don't do attribute merge if this async op failed */ + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + return rc ? 
rc : err; +} + +static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *lovset; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int rc = 0, err; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + + rc = lov_prep_getattr_set(exp, oinfo, &lovset); + if (rc) + return rc; + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &lovset->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n", + POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, rqset); + if (rc) { + CERROR("%s: getattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&oinfo->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + goto out; + } + } + + if (!list_empty(&rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_getattr_interpret; + rqset->set_arg = (void *)lovset; + return rc; + } +out: + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + return rc ? rc : err; +} + +static int lov_setattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_setattr_set(lovset); + return rc ? rc : err; +} + +/* If @oti is given, the request goes from MDS and responses from OSTs are not + needed. Otherwise, a client is waiting for responses. */ +static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); + if (rc) + return rc; + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), + oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n", + POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + + rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, oti, rqset); + if (rc) { + CERROR("error: setattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + break; + } + } + + /* If we are not waiting for responses on async requests, return. 
*/ + if (rc || !rqset || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_setattr_set(set); + return rc ? rc : err; + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_setattr_interpret; + rqset->set_arg = (void *)set; + + return 0; +} + +/* find any ldlm lock of the inode in lov + * return 0 not find + * 1 find one + * < 0 error */ +static int lov_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, ldlm_iterator_t it, + void *data) +{ + struct lov_obd *lov; + int rc = 0, i; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_stripe_md submd; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx); + continue; + } + + submd.lsm_oi = loi->loi_oi; + submd.lsm_stripe_count = 0; + rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, + &submd, it, data); + if (rc != 0) + return rc; + } + return rc; +} + +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + if (rc) + atomic_set(&lovset->set_completes, 0); + + err = lov_fini_statfs_set(lovset); + return rc ? rc : err; +} + +static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + return rc; + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_statfs_set(set); + return rc ? 
rc : err; + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + return 0; +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + + /* for obdclass we forbid using obd_statfs_rqset, but prefer using async + * statfs requests */ + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = lov_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + return rc; +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if ((index >= count)) + return -ENODEV; + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + return -EAGAIN; + if (!lov->lov_tgts[index]->ltd_active) + return -ENODATA; + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + return -EINVAL; + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + return -EFAULT; + + flags = uarg ? *(__u32 *)uarg : 0; + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + flags); + if (rc) + return rc; + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + return -EFAULT; + break; + } + case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data; + struct lov_desc *desc; + char *buf = NULL; + __u32 *genp; + + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) + return -EINVAL; + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + if (sizeof(__u32) * count > data->ioc_inllen3) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + memcpy(desc, &(lov->desc), sizeof(*desc)); + + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; + genp = (__u32 *)data->ioc_inlbuf3; + /* the uuid will be empty for deleted OSTs */ + for (i = 0; i < count; i++, uuidp++, genp++) { + if (!lov->lov_tgts[i]) + continue; + *uuidp = lov->lov_tgts[i]->ltd_uuid; + *genp = lov->lov_tgts[i]->ltd_gen; + } + + if (copy_to_user((void *)uarg, buf, len)) + rc = -EFAULT; + obd_ioctl_freedata(buf, len); + break; + } + case LL_IOC_LOV_GETSTRIPE: + rc = lov_getstripe(exp, karg, uarg); + break; + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + return -EINVAL; + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + return -EINVAL; + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; 
i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + return -EINVAL; + + break; + } + } else { + return -EINVAL; + } + + if (i >= count) + return -EAGAIN; + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + return -ENOTTY; + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force flag but for lov, not + * osc. Let's pass it through */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + return err; + } else if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG(err == -ENOTTY ? + D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + return rc; +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. + * + * \param fiemap fiemap request header + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap, + struct lov_stripe_md *lsm, u64 fm_start, + u64 fm_end, int *start_stripe) +{ + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_start, lun_end; + u64 fm_end_offset; + int stripe_no = -1, i; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + /* Find out stripe_no from ost_index saved in the fe_device */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) { + stripe_no = i; + break; + } + } + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end, + &lun_start, &lun_end)) != 0 && + local_end < lun_end) { + fm_end_offset = local_end; + *start_stripe = stripe_no; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. 
*/ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count; + } + + return fm_end_offset; +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. Else we check if the mapping + * intersects each OST and find last_stripe. + * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe of the mapping + * \param stripe_count the number of stripes across which to map is returned + * + * \retval last_stripe return the last stripe of the mapping + */ +static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, u64 fm_start, + u64 fm_end, int start_stripe, + int *stripe_count) +{ + int last_stripe; + u64 obd_start, obd_end; + int i, j; + + if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) { + last_stripe = start_stripe < 1 ? lsm->lsm_stripe_count - 1 : + start_stripe - 1; + *stripe_count = lsm->lsm_stripe_count; + } else { + for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count; + i = (i + 1) % lsm->lsm_stripe_count, j++) { + if ((lov_stripe_intersects(lsm, i, fm_start, fm_end, + &obd_start, &obd_end)) == 0) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) %lsm->lsm_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap fiemap request header + * \param lcl_fm_ext array of local fiemap extents to be copied + * \param ost_index OST index to be written into the fm_device field for each + extent + * \param ext_count number of extents to be copied + * \param current_extent where to start copying in main extent array + */ +static void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap, + struct ll_fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent) +{ + char *to; + int ext; + + for (ext = 0; ext < ext_count; ext++) { + lcl_fm_ext[ext].fe_device = ost_index; + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent)); +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + */ +static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key, + __u32 *vallen, void *val, struct lov_stripe_md *lsm) +{ + struct ll_fiemap_info_key *fm_key = key; + struct ll_user_fiemap *fiemap = val; + struct ll_user_fiemap *fm_local = NULL; + struct ll_fiemap_extent *lcl_fm_ext; + int count_local; + unsigned int get_num_extents = 0; + int ost_index = 0, actual_start_stripe, start_stripe; + u64 fm_start, fm_end, fm_length, fm_end_offset; + u64 curr_loc; + int current_extent = 0, rc = 0, i; + int ost_eof = 0; /* EOF for object */ + int ost_done = 0; /* done with required mapping for this OST? 
*/ + int last_stripe; + int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + + if (!lsm_has_objects(lsm)) { + rc = 0; + goto out; + } + + if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) { + rc = -ENOMEM; + goto out; + } + lcl_fm_ext = &fm_local->fm_extents[0]; + + count_local = fiemap_size_to_count(buffer_size); + + memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap)); + fm_start = fiemap->fm_start; + fm_length = fiemap->fm_length; + /* Calculate start stripe, last stripe and length of mapping */ + actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start); + fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size : + fm_start + fm_length - 1); + /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */ + if (fm_end > fm_key->oa.o_size) + fm_end = fm_key->oa.o_size; + + last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end, + actual_start_stripe, &stripe_count); + + fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start, + fm_end, &start_stripe); + if (fm_end_offset == -EINVAL) { + rc = -EINVAL; + goto out; + } + + if (fiemap_count_to_size(fiemap->fm_extent_count) > *vallen) + fiemap->fm_extent_count = fiemap_size_to_count(*vallen); + if (fiemap->fm_extent_count == 0) { + get_num_extents = 1; + count_local = 0; + } + /* Check each stripe */ + for (cur_stripe = start_stripe, i = 0; i < stripe_count; + i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) { + u64 req_fm_len; /* Stores length of required mapping */ + u64 len_mapped_single_call; + u64 lun_start, lun_end, obd_object_end; + unsigned int ext_count; + + cur_stripe_wrap = cur_stripe; + + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end, + &lun_start, &obd_object_end)) == 0) + continue; + + if (lov_oinfo_is_dummy(lsm->lsm_oinfo[cur_stripe])) { + rc = -EIO; + goto out; + } + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then lun_start needs to be set to + * fm_end_offset */ + if (fm_end_offset != 0 && cur_stripe == start_stripe) + lun_start = fm_end_offset; + + if (fm_length != ~0ULL) { + /* Handle fm_start + fm_length overflow */ + if (fm_start + fm_length < fm_start) + fm_length = ~0ULL - fm_start; + lun_end = lov_size_to_stripe(lsm, fm_start + fm_length, + cur_stripe); + } else { + lun_end = ~0ULL; + } + + if (lun_start == lun_end) + continue; + + req_fm_len = obd_object_end - lun_start; + fm_local->fm_length = 0; + len_mapped_single_call = 0; + + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + ost_eof = 0; + ost_done = 0; + do { + if (get_num_extents == 0) { + /* Don't get too many extents. 
*/ + if (current_extent + count_local > + fiemap->fm_extent_count) + count_local = fiemap->fm_extent_count - + current_extent; + } + + lun_start += len_mapped_single_call; + fm_local->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fm_local->fm_length; + fm_local->fm_extent_count = count_local; + fm_local->fm_mapped_extents = 0; + fm_local->fm_flags = fiemap->fm_flags; + + fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi; + ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx; + + if (ost_index < 0 || + ost_index >= lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + /* If OST is inactive, return extent with UNKNOWN flag */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fm_local->fm_flags |= FIEMAP_EXTENT_LAST; + fm_local->fm_mapped_extents = 1; + + lcl_fm_ext[0].fe_logical = lun_start; + lcl_fm_ext[0].fe_length = obd_object_end - + lun_start; + lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fm_local->fm_start = lun_start; + fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local)); + *vallen=fiemap_count_to_size(fm_local->fm_extent_count); + rc = obd_get_info(NULL, + lov->lov_tgts[ost_index]->ltd_exp, + keylen, key, vallen, fm_local, lsm); + if (rc != 0) + goto out; + +inactive_tgt: + ext_count = fm_local->fm_mapped_extents; + if (ext_count == 0) { + ost_done = 1; + /* If last stripe has hole at the end, + * then we need to return */ + if (cur_stripe_wrap == last_stripe) { + fiemap->fm_mapped_extents = 0; + goto finish; + } + break; + } + + /* If we just need num of extents then go to next device */ + if (get_num_extents) { + current_extent += ext_count; + break; + } + + len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical - + lun_start + lcl_fm_ext[ext_count - 1].fe_length; + + /* Have we finished mapping on this device? */ + if (req_fm_len <= len_mapped_single_call) + ost_done = 1; + + /* Clear the EXTENT_LAST flag which can be present on + * last extent */ + if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST) + lcl_fm_ext[ext_count - 1].fe_flags &= + ~FIEMAP_EXTENT_LAST; + + curr_loc = lov_stripe_size(lsm, + lcl_fm_ext[ext_count - 1].fe_logical+ + lcl_fm_ext[ext_count - 1].fe_length, + cur_stripe); + if (curr_loc >= fm_key->oa.o_size) + ost_eof = 1; + + fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext, + ost_index, ext_count, + current_extent); + + current_extent += ext_count; + + /* Ran out of available extents? */ + if (current_extent >= fiemap->fm_extent_count) + goto finish; + } while (ost_done == 0 && ost_eof == 0); + + if (cur_stripe_wrap == last_stripe) + goto finish; + } + +finish: + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_stripe_count > 1) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (get_num_extents) + goto skip_last_device_calc; + + /* Check if we have reached the last stripe and whether mapping for that + * stripe is done. 
*/ + if (cur_stripe_wrap == last_stripe) { + if (ost_done || ost_eof) + fiemap->fm_extents[current_extent - 1].fe_flags |= + FIEMAP_EXTENT_LAST; + } + +skip_last_device_calc: + fiemap->fm_mapped_extents = current_extent; + +out: + OBD_FREE_LARGE(fm_local, buffer_size); + return rc; +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i, rc; + + if (!vallen || !val) + return -EFAULT; + + obd_getref(obddev); + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + struct { + char name[16]; + struct ldlm_lock *lock; + } *data = key; + struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name; + struct lov_oinfo *loi; + __u32 *stripe = val; + + if (*vallen < sizeof(*stripe)) { + rc = -EFAULT; + goto out; + } + *vallen = sizeof(*stripe); + + /* XXX This is another one of those bits that will need to + * change if we ever actually support nested LOVs. It uses + * the lock's export to find out which stripe it is. */ + /* XXX - it's assumed all the locks for deleted OSTs have + * been cancelled. Also, the export for deleted OSTs will + * be NULL and won't match the lock's export. */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + loi = lsm->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov->lov_tgts[loi->loi_ost_idx]) + continue; + if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp == + data->lock->l_conn_export && + ostid_res_name_eq(&loi->loi_oi, res_id)) { + *stripe = i; + rc = 0; + goto out; + } + } + LDLM_ERROR(data->lock, "lock on inode without such object"); + dump_lsm(D_ERROR, lsm); + rc = -ENXIO; + goto out; + } else if (KEY_IS(KEY_LAST_ID)) { + struct obd_id_info *info = val; + __u32 size = sizeof(u64); + struct lov_tgt_desc *tgt; + + LASSERT(*vallen == sizeof(struct obd_id_info)); + tgt = lov->lov_tgts[info->idx]; + + if (!tgt || !tgt->ltd_active) { + rc = -ESRCH; + goto out; + } + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, + &size, info->data, NULL); + rc = 0; + goto out; + } else if (KEY_IS(KEY_LOVDESC)) { + struct lov_desc *desc_ret = val; + *desc_ret = lov->desc; + + rc = 0; + goto out; + } else if (KEY_IS(KEY_FIEMAP)) { + rc = lov_fiemap(lov, keylen, key, vallen, val, lsm); + goto out; + } else if (KEY_IS(KEY_CONNECT_FLAG)) { + struct lov_tgt_desc *tgt; + __u64 ost_idx = *((__u64 *)val); + + LASSERT(*vallen == sizeof(__u64)); + LASSERT(ost_idx < lov->desc.ld_tgt_count); + tgt = lov->lov_tgts[ost_idx]; + + if (!tgt || !tgt->ltd_exp) { + rc = -ESRCH; + goto out; + } + + *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp); + rc = 0; + goto out; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + rc = 0; + goto out; + } + + rc = -EINVAL; + +out: + obd_putref(obddev); + return rc; +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + u32 count; + int i, rc = 0, err; + struct lov_tgt_desc *tgt; + unsigned incr, check_uuid, + do_inactive, no_set; + unsigned next_id = 0, mds_con = 0, capa = 0; + + incr = check_uuid = do_inactive = no_set = 0; + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + return -ENOMEM; + } + + obd_getref(obddev); + count = lov->desc.ld_tgt_count; + + if (KEY_IS(KEY_NEXT_ID)) { + count = vallen 
/ sizeof(struct obd_id_info); + vallen = sizeof(u64); + incr = sizeof(struct obd_id_info); + do_inactive = 1; + next_id = 1; + } else if (KEY_IS(KEY_CHECKSUM)) { + do_inactive = 1; + } else if (KEY_IS(KEY_EVICT_BY_NID)) { + /* use defaults: do_inactive = incr = 0; */ + } else if (KEY_IS(KEY_MDS_CONN)) { + mds_con = 1; + } else if (KEY_IS(KEY_CAPA_KEY)) { + capa = 1; + } else if (KEY_IS(KEY_CACHE_SET)) { + LASSERT(lov->lov_cache == NULL); + lov->lov_cache = val; + do_inactive = 1; + } + + for (i = 0; i < count; i++, val = (char *)val + incr) { + if (next_id) { + tgt = lov->lov_tgts[((struct obd_id_info *)val)->idx]; + } else { + tgt = lov->lov_tgts[i]; + } + /* OST was disconnected */ + if (!tgt || !tgt->ltd_exp) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + if (mds_con) { + struct mds_group_info *mgi; + + LASSERT(vallen == sizeof(*mgi)); + mgi = (struct mds_group_info *)val; + + /* Only want a specific OSC */ + if (mgi->uuid && !obd_uuid_equals(mgi->uuid, + &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, sizeof(int), + &mgi->group, set); + } else if (next_id) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, + ((struct obd_id_info *)val)->data, set); + } else if (capa) { + struct mds_capa_info *info = (struct mds_capa_info *)val; + + LASSERT(vallen == sizeof(*info)); + + /* Only want a specific OSC */ + if (info->uuid && + !obd_uuid_equals(info->uuid, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, + key, sizeof(*info->capa), + info->capa, set); + } else { + /* Only want a specific OSC */ + if (check_uuid && + !obd_uuid_equals(val, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + } + + if (!rc) + rc = err; + } + + obd_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } + return rc; +} + +void lov_stripe_lock(struct lov_stripe_md *md) + __acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current_pid()); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current_pid(); +} +EXPORT_SYMBOL(lov_stripe_lock); + +void lov_stripe_unlock(struct lov_stripe_md *md) + __releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current_pid()); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} +EXPORT_SYMBOL(lov_stripe_unlock); + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + + if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON && + oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF && + oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != Q_INITQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != Q_FINVALIDATE) { + CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd); + return -EFAULT; + } + + /* for lov tgt */ + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -EREMOTEIO; + CERROR("ost %d is inactive\n", i); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + 
rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + obd_putref(obd); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + return rc; +} + +static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0; + + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Skip quota check on the administratively disabled OSTs. */ + if (!lov->lov_tgts[i]->ltd_activate) { + CWARN("lov idx %d was administratively disabled, skip quotacheck on it.\n", + i); + continue; + } + + if (!lov->lov_tgts[i]->ltd_active) { + CERROR("lov idx %d inactive\n", i); + rc = -EIO; + goto out; + } + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate) + continue; + + err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + +out: + obd_putref(obd); + + return rc; +} + +static struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_precleanup = lov_precleanup, + .o_cleanup = lov_cleanup, + /*.o_process_config = lov_process_config,*/ + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, + .o_packmd = lov_packmd, + .o_unpackmd = lov_unpackmd, + .o_create = lov_create, + .o_destroy = lov_destroy, + .o_getattr_async = lov_getattr_async, + .o_setattr_async = lov_setattr_async, + .o_adjust_kms = lov_adjust_kms, + .o_find_cbdata = lov_find_cbdata, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_getref = lov_getref, + .o_putref = lov_putref, + .o_quotactl = lov_quotactl, + .o_quotacheck = lov_quotacheck, +}; + +struct kmem_cache *lov_oinfo_slab; + +static int __init lov_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + lprocfs_lov_init_vars(&lvars); + + rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars, + LUSTRE_LOV_NAME, &lov_device_type); + + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + return rc; +} + +static void /*__exit*/ lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(lov_init); +module_exit(lov_exit); diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_object.c b/kernel/drivers/staging/lustre/lustre/lov/lov_object.c new file mode 100644 index 000000000..a22342fa7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_object.c @@ -0,0 +1,1001 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" +#include "../include/lclient.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_install)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); + +/***************************************************************************** + * + * Lov object layout operations. + * + */ + +static void lov_install_empty(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + /* + * File without objects. 
+ */ +} + +static int lov_init_empty(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static void lov_install_raid0(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ +} + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + return lu2cl(o); +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *stripe, struct lov_layout_raid0 *r0, + int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + struct lov_oinfo *oinfo; + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. */ + cl_object_kill(env, stripe); + cl_object_put(env, stripe); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(stripe); + + oinfo = lov->lo_lsm->lsm_oinfo[idx]; + CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID + " idx: %d gen: %d\n", + PFID(&subhdr->coh_lu.loh_fid), subhdr, idx, + PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + /* reuse ->coh_attr_guard to protect coh_parent change */ + spin_lock(&subhdr->coh_attr_guard); + parent = subhdr->coh_parent; + if (parent == NULL) { + subhdr->coh_parent = hdr; + spin_unlock(&subhdr->coh_attr_guard); + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&stripe->co_lu, "lov-parent", lov); + r0->lo_sub[idx] = cl2lovsub(stripe); + r0->lo_sub[idx]->lso_super = lov; + r0->lo_sub[idx]->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + spin_unlock(&subhdr->coh_attr_guard); + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (old_lov->lo_layout_invalid) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &stripe->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &stripe->co_lu, + "stripe %d is already owned.\n", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n"); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, stripe); + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + int result; + int i; + + struct cl_object *stripe; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + struct lu_fid *ofid = <i->lti_fid; + struct lov_layout_raid0 *r0 = &state->raid0; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) { + dump_lsm(D_ERROR, lsm); + LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n", + LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic); + } + + 
LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + r0->lo_nr = lsm->lsm_stripe_count; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + if (r0->lo_sub != NULL) { + result = 0; + subconf->coc_inode = conf->coc_inode; + spin_lock_init(&r0->lo_sub_lock); + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr && result == 0; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lsm->lsm_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, + oinfo->loi_ost_idx); + if (result != 0) + goto out; + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (!IS_ERR(stripe)) { + result = lov_init_sub(env, lov, stripe, r0, i); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + } + } else { + result = PTR_ERR(stripe); + } + } + } else + result = -ENOMEM; +out: + return result; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + + LASSERT(lsm != NULL); + LASSERT(lsm_is_released(lsm)); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); + + lov_layout_wait(env, lov); + + cl_object_prune(env, &lov->lo_cl); + return 0; +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lov_layout_raid0 *r0; + struct lu_site *site; + struct lu_site_bkt_data *bkt; + wait_queue_t *waiter; + + r0 = &lov->u.raid0; + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_fini() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). 
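+ * In short (a summary of the loop that follows, not in the original
+ * comment): r0->lo_sub[idx] is re-checked under lo_sub_lock and
+ * schedule() is called until lovsub_object_fini() has cleared the
+ * slot; only then is the waiter removed from the queue.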
*/ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(&bkt->lsb_marche_funebre, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + dump_lsm(D_INODE, lsm); + + lov_layout_wait(env, lov); + if (r0->lo_sub != NULL) { + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_locks_prune(env, &los->lso_cl, 1); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. + */ + lov_subobject_kill(env, lov, los, i); + } + } + } + cl_object_prune(env, &lov->lo_cl); + return 0; +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + r0->lo_sub = NULL; + } + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); +} + +static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); + return 0; +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "stripes: %d, %s, lsm{%p 0x%08X %d %u %u}:\n", + r0->lo_nr, lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_stripe_count, lsm->lsm_layout_gen); + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u %u}:\n", + lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_stripe_count, lsm->lsm_layout_gen); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. 
+ */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct cl_attr *lov_attr = &r0->lo_attr; + int result = 0; + + /* this is called w/o holding type guard mutex, so it must be inside + * an on going IO otherwise lsm may be replaced. + * LU-2117: it turns out there exists one exception. For mmaped files, + * the lock of those files may be requested in the other file's IO + * context, and this function is called in ccc_lock_state(), it will + * hit this assertion. + * Anyway, it's still okay to call attr_get w/o type guard as layout + * can't go if locks exist. */ + /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */ + + if (!r0->lo_attr_valid) { + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + __u64 kms = 0; + + memset(lvb, 0, sizeof(*lvb)); + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(lov_attr, lvb); + lov_attr->cat_kms = kms; + r0->lo_attr_valid = 1; + } + } + if (result == 0) { /* merge results */ + attr->cat_blocks = lov_attr->cat_blocks; + attr->cat_size = lov_attr->cat_size; + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + return result; +} + +static const struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_install = lov_install_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty + }, + [LLT_RAID0] = { + .llo_init = lov_init_raid0, + .llo_delete = lov_delete_raid0, + .llo_fini = lov_fini_raid0, + .llo_install = lov_install_raid0, + .llo_print = lov_print_raid0, + .llo_page_init = lov_page_init_raid0, + .llo_lock_init = lov_lock_init_raid0, + .llo_io_init = lov_io_init_raid0, + .llo_getattr = lov_attr_get_raid0 + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_install = lov_install_empty, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty + } +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) 
\ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + if (lsm_is_released(lsm)) + return LLT_RELEASED; + return LLT_RAID0; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + struct l_wait_info lwi = { 0 }; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + l_wait_event(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0, &lwi); + } + return 0; +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, + const struct cl_object_conf *conf) +{ + int result; + enum lov_layout_type llt = LLT_EMPTY; + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + void *cookie; + struct lu_env *env; + int refcheck; + + LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + if (conf->u.coc_md != NULL) + llt = lov_type(conf->u.coc_md->lsm); + LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch)); + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + cl_env_reexit(cookie); + return PTR_ERR(env); + } + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = &lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + result = old_ops->llo_delete(env, lov, &lov->u); + if (result == 0) { + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + lov->lo_type = LLT_EMPTY; + result = 
new_ops->llo_init(env, + lu2lov_dev(lov->lo_cl.co_lu.lo_dev), + lov, conf, state); + if (result == 0) { + new_ops->llo_install(env, lov, state); + lov->lo_type = llt; + } else { + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + } + } + + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + return result; +} + +/***************************************************************************** + * + * Lov object operations. + * + */ +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_device *dev = lu2lov_dev(obj->lo_dev); + struct lov_object *lov = lu2lov(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + int result; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(cconf->u.coc_md->lsm); + ops = &lov_dispatch[lov->lo_type]; + result = ops->llo_init(env, dev, lov, cconf, set); + if (result == 0) + ops->llo_install(env, lov, set); + return result; +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lov->lo_layout_invalid = true; + result = 0; + goto out; + } + + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (lov->lo_layout_invalid && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + goto out; + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if (conf->u.coc_md != NULL) + lsm = conf->u.coc_md->lsm; + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_pattern == lsm->lsm_pattern))) { + /* same version of layout */ + lov->lo_layout_invalid = false; + result = 0; + goto out; + } + + /* will change layout - check if there still exists active IO. 
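+ * If there is, we only mark the layout invalid and return -EBUSY
+ * below; a later OBJECT_CONF_WAIT (the branch handled above) lets
+ * the caller wait for those IOs to drain so the change can be
+ * retried.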
*/ + if (atomic_read(&lov->lo_active_ios) > 0) { + lov->lo_layout_invalid = true; + result = -EBUSY; + goto out; + } + + lov->lo_layout_invalid = lov_layout_change(env, lov, conf); + +out: + lov_conf_unlock(lov); + CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n", + PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid); + return result; +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), + llo_page_init, env, obj, page, vmpage); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !io->ci_ignore_layout, env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. 
*/ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_set = lov_attr_set, + .coo_conf_set = lov_conf_set +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. + */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + lov->lo_layout_invalid, current); + } + lov_conf_thaw(lov); + return lsm; +} + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return; + + CDEBUG(D_INODE, "lsm %p decref %d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), current); + + lov_free_memmd(&lsm); +} + +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj) +{ + struct lu_object *luobj; + struct lov_stripe_md *lsm = NULL; + + if (clobj == NULL) + return NULL; + + luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu, + &lov_device_type); + if (luobj != NULL) + lsm = lov_lsm_addref(lu2lov(luobj)); + return lsm; +} +EXPORT_SYMBOL(lov_lsm_get); + +void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} +EXPORT_SYMBOL(lov_lsm_put); + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_RAID0: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + case LLT_RELEASED: + case LLT_EMPTY: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + return rc; +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c b/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c new file mode 100644 index 000000000..9c8c77c05 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c @@ -0,0 +1,264 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE 
HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" + +#include "lov_internal.h" + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + unsigned long stripe_size; + u64 swidth; + u64 lov_size; + int magic = lsm->lsm_magic; + + if (ost_size == 0) + return 0; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + return lov_size; +} + +/* we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. 
+ * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. */ +int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off, + int stripeno, u64 *obdoff) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + LASSERT(lsm_op_find(magic) != NULL); + + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (u64)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. 
+ * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (u64)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. */ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + u64 start, u64 end, u64 *obd_start, u64 *obd_end) +{ + int start_side, end_side; + + start_side = lov_stripe_offset(lsm, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, end, stripeno, obd_end); + + CDEBUG(D_INODE, "[%llu->%llu] -> [(%d) %llu->%llu (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) 
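+ *
+ * A worked example with illustrative numbers (not from the original
+ * comment): assume 3 stripes of 1MB each, so the stripe width is
+ * 3MB, and take stripe 1 with the file extent [3.5MB, 5.5MB].
+ * lov_stripe_offset(3.5MB, stripe 1) returns -1 and sets obd_start
+ * to 1MB, since 3.5MB falls in a stripe-0 chunk, while
+ * lov_stripe_offset(5.5MB, stripe 1) returns 1 and sets obd_end to
+ * 2MB, since 5.5MB falls in a stripe-2 chunk.  Because end_side is
+ * non-zero, obd_end is decremented below to 2MB - 1, and the extent
+ * is reported as intersecting stripe 1 over [1MB, 2MB - 1] of its
+ * object, i.e. the chunk backing file range [4MB, 5MB).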
*/ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, swidth; + int magic = lsm->lsm_magic; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c b/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c new file mode 100644 index 000000000..5356d5324 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c @@ -0,0 +1,511 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../include/lustre_net.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre/lustre_user.h" + +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + lov_dump_lmm_common(level, lmm); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = le32_to_cpu(((struct lov_mds_md *)lmm)->lmm_magic); + switch (magic) { + case LOV_MAGIC_V1: + lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); + break; + default: + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); + lov_dump_lmm_common(level, lmm); + break; + } +} + +/* Pack LOV object metadata for disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + * + * XXX In the future, this will be enhanced to get the EA size from the + * underlying OSC device(s) to get their EA sizes so we can stack + * LOVs properly. For now lov_mds_md_size() just assumes one u64 + * per stripe. + */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_mds_md_v1 *lmmv1; + struct lov_mds_md_v3 *lmmv3; + __u16 stripe_count; + struct lov_ost_data_v1 *lmm_objects; + int lmm_size, lmm_magic; + int i; + int cplen = 0; + + if (lsm) { + lmm_magic = lsm->lsm_magic; + } else { + if (lmmp && *lmmp) + lmm_magic = le32_to_cpu((*lmmp)->lmm_magic); + else + /* lsm == NULL and lmmp == NULL */ + lmm_magic = LOV_MAGIC; + } + + if ((lmm_magic != LOV_MAGIC_V1) && + (lmm_magic != LOV_MAGIC_V3)) { + CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + return -EINVAL; + + } + + if (lsm) { + /* If we are just sizing the EA, limit the stripe count + * to the actual number of OSTs in this filesystem. 
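+ * (That is, when lov_packmd() is called with lmmp == NULL purely to
+ * compute the EA size, the stripe count is first adjusted via
+ * lov_get_stripecnt() so that the size returned further below
+ * reflects the count that will actually be used.)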
*/ + if (!lmmp) { + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lsm->lsm_stripe_count); + lsm->lsm_stripe_count = stripe_count; + } else if (!lsm_is_released(lsm)) { + stripe_count = lsm->lsm_stripe_count; + } else { + stripe_count = 0; + } + } else { + /* No need to allocate more than maximum supported stripes. + * Anyway, this is pretty inaccurate since ld_tgt_count now + * represents max index and we should rely on the actual number + * of OSTs instead */ + stripe_count = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, lmm_magic); + + if (stripe_count > lov->desc.ld_tgt_count) + stripe_count = lov->desc.ld_tgt_count; + } + + /* XXX LOV STACKING call into osc for sizes */ + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + + if (!lmmp) + return lmm_size; + + if (*lmmp && !lsm) { + stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count); + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + OBD_FREE_LARGE(*lmmp, lmm_size); + *lmmp = NULL; + return 0; + } + + if (!*lmmp) { + OBD_ALLOC_LARGE(*lmmp, lmm_size); + if (!*lmmp) + return -ENOMEM; + } + + CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n", + lmm_magic, lmm_size); + + lmmv1 = *lmmp; + lmmv3 = (struct lov_mds_md_v3 *)*lmmp; + if (lmm_magic == LOV_MAGIC_V3) + lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3); + else + lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); + + if (!lsm) + return lmm_size; + + /* lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + if (lsm->lsm_magic == LOV_MAGIC_V3) { + cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, + sizeof(lmmv3->lmm_pool_name)); + if (cplen >= sizeof(lmmv3->lmm_pool_name)) + return -E2BIG; + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + /* XXX LOV STACKING call down to osc_packmd() to do packing */ + LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID + " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi), + i, stripe_count, loi->loi_ost_idx); + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + return lmm_size; +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + + +static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count) +{ + int rc; + + if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) { + char *buffer; + int sz; + + CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n", 
+ le32_to_cpu(*(__u32 *)lmm), lmm_bytes); + sz = lmm_bytes * 2 + 1; + OBD_ALLOC_LARGE(buffer, sz); + if (buffer != NULL) { + int i; + + for (i = 0; i < lmm_bytes; i++) + sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]); + buffer[sz - 1] = '\0'; + CERROR("%s\n", buffer); + OBD_FREE_LARGE(buffer, sz); + } + return -EINVAL; + } + rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, + lmm_bytes, stripe_count); + return rc; +} + +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic) +{ + int i, lsm_size; + + CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count); + + *lsmp = lsm_alloc_plain(stripe_count, &lsm_size); + if (!*lsmp) { + CERROR("can't allocate lsmp stripe_count %d\n", stripe_count); + return -ENOMEM; + } + + atomic_set(&(*lsmp)->lsm_refc, 1); + spin_lock_init(&(*lsmp)->lsm_lock); + (*lsmp)->lsm_magic = magic; + (*lsmp)->lsm_stripe_count = stripe_count; + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; + (*lsmp)->lsm_pattern = pattern; + (*lsmp)->lsm_pool_name[0] = '\0'; + (*lsmp)->lsm_layout_gen = 0; + if (stripe_count > 0) + (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0; + + for (i = 0; i < stripe_count; i++) + loi_init((*lsmp)->lsm_oinfo[i]); + + return lsm_size; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + refc = atomic_dec_return(&lsm->lsm_refc); + if (refc == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + lsm_op_find(lsm->lsm_magic)->lsm_free(lsm); + } + return refc; +} + + +/* Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int rc = 0, lsm_size; + __u16 stripe_count; + __u32 magic; + __u32 pattern; + + /* If passed an MDS struct use values from there, otherwise defaults */ + if (lmm) { + rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count); + if (rc) + return rc; + magic = le32_to_cpu(lmm->lmm_magic); + } else { + magic = LOV_MAGIC; + stripe_count = lov_get_stripecnt(lov, magic, 0); + } + + /* If we aren't passed an lsmp struct, we just want the size */ + if (!lsmp) { + /* XXX LOV STACKING call into osc for sizes */ + LBUG(); + return lov_stripe_md_size(stripe_count); + } + /* If we are passed an allocated struct but nothing to unpack, free */ + if (*lsmp && !lmm) { + lov_free_memmd(lsmp); + return 0; + } + + pattern = le32_to_cpu(lmm->lmm_pattern); + lsm_size = lov_alloc_memmd(lsmp, stripe_count, pattern, magic); + if (lsm_size < 0) + return lsm_size; + + /* If we are passed a pointer but nothing to unpack, we only alloc */ + if (!lmm) + return lsm_size; + + LASSERT(lsm_op_find(magic) != NULL); + rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm); + if (rc) { + lov_free_memmd(lsmp); + return rc; + } + + return lsm_size; +} + +/* Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + */ +int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_user_md *lump) +{ + /* + * XXX huge struct allocated on stack. 
+ */ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum; + struct lov_mds_md *lmmk = NULL; + int rc, lmm_size; + int lum_size; + mm_segment_t seg; + + if (!lsm) + return -ENODATA; + + /* + * "Switch to kernel segment" to allow copying from kernel space by + * copy_{to,from}_user(). + */ + seg = get_fs(); + set_fs(KERNEL_DS); + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) { + rc = -EFAULT; + goto out_set; + } else if ((lum.lmm_magic != LOV_USER_MAGIC) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) { + rc = -EINVAL; + goto out_set; + } + + if (lum.lmm_stripe_count && + (lum.lmm_stripe_count < lsm->lsm_stripe_count)) { + /* Return right size of stripe to user */ + lum.lmm_stripe_count = lsm->lsm_stripe_count; + rc = copy_to_user(lump, &lum, lum_size); + rc = -EOVERFLOW; + goto out_set; + } + rc = lov_packmd(exp, &lmmk, lsm); + if (rc < 0) + goto out_set; + lmm_size = rc; + rc = 0; + + /* FIXME: Bug 1185 - copy fields properly when structs change */ + /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */ + CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3)); + CLASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lmmk->lmm_objects[0])); + + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) || + (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } + if (lum.lmm_magic == LOV_USER_MAGIC) { + /* User request for v1, we need skip lmm_pool_name */ + if (lmmk->lmm_magic == LOV_MAGIC_V3) { + memmove((char *)(&lmmk->lmm_stripe_count) + + sizeof(lmmk->lmm_stripe_count), + ((struct lov_mds_md_v3 *)lmmk)->lmm_objects, + lmmk->lmm_stripe_count * + sizeof(struct lov_ost_data_v1)); + lmm_size -= LOV_MAXPOOLNAME; + } + } else { + /* if v3 we just have to update the lum_size */ + lum_size = sizeof(struct lov_user_md_v3); + } + + /* User wasn't expecting this many OST entries */ + if (lum.lmm_stripe_count == 0) + lmm_size = lum_size; + else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { + rc = -EOVERFLOW; + goto out_set; + } + /* + * Have a difference between lov_mds_md & lov_user_md. + * So we have to re-order the data before copy to user. + */ + lum.lmm_stripe_count = lmmk->lmm_stripe_count; + lum.lmm_layout_gen = lmmk->lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count; + if (copy_to_user(lump, lmmk, lmm_size)) + rc = -EFAULT; + + obd_free_diskmd(exp, &lmmk); +out_set: + set_fs(seg); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_page.c b/kernel/drivers/staging/lustre/lustre/lov/lov_page.c new file mode 100644 index 000000000..c4596e8e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_page.c @@ -0,0 +1,232 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. + * + */ + +static int lov_page_invariant(const struct cl_page_slice *slice) +{ + const struct cl_page *page = slice->cpl_page; + const struct cl_page *sub = lov_sub_page(slice); + + return ergo(sub != NULL, + page->cp_child == sub && + sub->cp_parent == page && + page->cp_state == sub->cp_state); +} + +static void lov_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct cl_page *sub = lov_sub_page(slice); + + LINVRNT(lov_page_invariant(slice)); + + if (sub != NULL) { + LASSERT(sub->cp_state == CPS_FREEING); + lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent); + sub->cp_parent = NULL; + slice->cpl_page->cp_child = NULL; + cl_page_put(env, sub); + } +} + +static int lov_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + lov_sub_page(slice)->cp_owner = sub->sub_io; + lov_sub_put(sub); + } else + LBUG(); /* Arrgh */ + return 0; +} + +static void lov_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + lov_page_own(env, slice, io, 0); +} + +static int lov_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + int rc = 0; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + rc = cl_page_cache_add(sub->sub_env, sub->sub_io, + slice->cpl_page->cp_child, CRT_WRITE); + lov_sub_put(sub); + } else { + rc = PTR_ERR(sub); + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc); + } + return rc; +} + +static int lov_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_page_ops = { + .cpo_fini = lov_page_fini, + .cpo_own = 
lov_page_own, + .cpo_assume = lov_page_assume, + .io = { + [CRT_WRITE] = { + .cpo_cache_add = lov_page_cache_add + } + }, + .cpo_print = lov_page_print +}; + +static void lov_empty_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + LASSERT(slice->cpl_page->cp_child == NULL); +} + +int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_io *lio = lov_env_io(env); + struct cl_page *subpage; + struct cl_object *subobj; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + loff_t offset; + u64 suboff; + int stripe; + int rc; + + offset = cl_offset(obj, page->cp_index); + stripe = lov_stripe_number(loo->lo_lsm, offset); + LASSERT(stripe < r0->lo_nr); + rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, + &suboff); + LASSERT(rc == 0); + + lpg->lps_invalid = 1; + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops); + + sub = lov_sub_get(env, lio, stripe); + if (IS_ERR(sub)) { + rc = PTR_ERR(sub); + goto out; + } + + subobj = lovsub2cl(r0->lo_sub[stripe]); + subpage = cl_page_find_sub(sub->sub_env, subobj, + cl_index(subobj, suboff), vmpage, page); + lov_sub_put(sub); + if (IS_ERR(subpage)) { + rc = PTR_ERR(subpage); + goto out; + } + + if (likely(subpage->cp_parent == page)) { + lu_ref_add(&subpage->cp_reference, "lov", page); + lpg->lps_invalid = 0; + rc = 0; + } else { + CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n"); + CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n"); + LASSERT(0); + } + +out: + return rc; +} + + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_fini = lov_empty_page_fini, + .cpo_print = lov_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops); + addr = kmap(vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(vmpage); + cl_page_export(env, page, 1); + return 0; +} + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c b/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c new file mode 100644 index 000000000..d96163de7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c @@ -0,0 +1,673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
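lov_page_init_raid0() above relies on lov_stripe_number()/lov_stripe_offset(), defined elsewhere in the LOV code, to turn a file offset into a stripe index and an offset inside that stripe's object. For the plain RAID0 pattern this is the usual round-robin mapping; a small standalone sketch with example parameters (the stripe size and count here are arbitrary, not taken from a real lsm):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long stripe_size  = 1ULL << 20;	/* 1 MiB */
		unsigned int       stripe_count = 4;
		unsigned long long offsets[] = { 0, 123456, 5ULL << 20, 42ULL << 20 };

		for (unsigned i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
			unsigned long long off    = offsets[i];
			unsigned long long chunk  = off / stripe_size;
			unsigned int       stripe = chunk % stripe_count;
			/* offset inside the chosen stripe object */
			unsigned long long suboff =
				(chunk / stripe_count) * stripe_size +
				off % stripe_size;

			printf("file off %12llu -> stripe %u, sub off %llu\n",
			       off, stripe, suboff);
		}
		return 0;
	}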
+ * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lov_ost_pool_free(&(pool->pool_rr.lqr_pool)); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE_PTR(pool); + } +} + +static void lov_pool_putref_locked(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + LASSERT(atomic_read(&pool->pool_refcount) > 1); + + atomic_dec(&pool->pool_refcount); +} + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. + * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < LOV_MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return pool->pool_name; +} + +static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME); +} + +static void *pool_hashobject(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct pool_desc, pool_hash); +} + +static void pool_hashrefcount_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_getref(pool); +} + +static void pool_hashrefcount_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_putref_locked(pool); +} + +cfs_hash_ops_t pool_hash_operations = { + .hs_hash = pool_hashfn, + .hs_key = pool_key, + .hs_keycmp = pool_hashkey_keycmp, + .hs_object = pool_hashobject, + .hs_get = pool_hashrefcount_get, + .hs_put_locked = pool_hashrefcount_put_locked, + +}; + +#if defined (CONFIG_PROC_FS) +/* ifdef needed for liblustre support */ +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to 
pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + down_read(&pool_tgt_rw_sem(iter->pool)); + iter->idx++; + if (iter->idx == pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + up_read(&pool_tgt_rw_sem(iter->pool)); + return NULL; + } + up_read(&pool_tgt_rw_sem(iter->pool)); + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + down_read(&pool_tgt_rw_sem(iter->pool)); + tgt = pool_tgt(iter->pool, iter->idx); + up_read(&pool_tgt_rw_sem(iter->pool)); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PDE_DATA(inode); + } + return rc; +} + +static struct file_operations pool_proc_operations = { + .open = pool_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + lov_pool_getref(pool); + + CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || 
!(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n", + pool->pool_name, i, + obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + + up_read(&pool_tgt_rw_sem(pool)); + lov_pool_putref(pool); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +{ + if (count == 0) + count = LOV_POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count; + OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0])); + if (op->op_array == NULL) { + op->op_size = 0; + return -ENOMEM; + } + return 0; +} + +/* Caller must hold write op_rwlock */ +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +{ + __u32 *new; + int new_size; + + LASSERT(min_count != 0); + + if (op->op_count < op->op_size) + return 0; + + new_size = max(min_count, 2 * op->op_size); + OBD_ALLOC(new, new_size * sizeof(op->op_array[0])); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0])); + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = new; + op->op_size = new_size; + return 0; +} + +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +{ + int rc = 0, i; + + down_write(&op->op_rw_sem); + + rc = lov_ost_pool_extend(op, min_count); + if (rc) + goto out; + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + rc = -EEXIST; + goto out; + } + } + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; +out: + up_write(&op->op_rw_sem); + return rc; +} + +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +{ + int i; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + return 0; + } + } + + up_write(&op->op_rw_sem); + return -EINVAL; +} + +int lov_ost_pool_free(struct ost_pool *op) +{ + if (op->op_size == 0) + return 0; + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + return 0; +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + return -ENAMETOOLONG; + + OBD_ALLOC_PTR(new_pool); + if (new_pool == NULL) + return -ENOMEM; + + strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME); + new_pool->pool_name[LOV_MAXPOOLNAME] = '\0'; + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + goto out_err; + + memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr)); + rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); + if (rc) + goto out_free_pool_obds; + + INIT_HLIST_NODE(&new_pool->pool_hash); + +#if defined (CONFIG_PROC_FS) + /* we need this assert seq_file is not implemented for liblustre */ + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if 
(IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) { + rc = -EEXIST; + goto out_err; + } + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + return 0; + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + lprocfs_remove(&new_pool->pool_proc_entry); + + lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); +out_free_pool_obds: + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE_PTR(new_pool); + return rc; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + return 0; +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + obd_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + goto out; + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + obd_str2uuid(&ost_uuid, ostname); + + obd_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + pool->pool_rr.lqr_dirty = 1; + + 
CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool) +{ + int i, rc; + + /* caller may no have a ref on pool if it got the pool + * without calling lov_find_pool() (e.g. go through the lov pool + * list) + */ + lov_pool_getref(pool); + + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool); i++) { + if (pool_tgt_array(pool)[i] == idx) { + rc = 0; + goto out; + } + } + rc = -ENOENT; +out: + up_read(&pool_tgt_rw_sem(pool)); + + lov_pool_putref(pool); + return rc; +} + +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname) +{ + struct pool_desc *pool; + + pool = NULL; + if (poolname[0] != '\0') { + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n", + poolname); + if ((pool != NULL) && (pool_tgt_count(pool) == 0)) { + CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n", + poolname); + /* pool is ignored, so we remove ref on it */ + lov_pool_putref(pool); + pool = NULL; + } + } + return pool; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_request.c b/kernel/drivers/staging/lustre/lustre/lov/lov_request.c new file mode 100644 index 000000000..933e2d1f8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_request.c @@ -0,0 +1,773 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + atomic_set(&set->set_finish_checked, 0); + set->set_cookies = NULL; + INIT_LIST_HEAD(&set->set_list); + atomic_set(&set->set_refcount, 1); + init_waitqueue_head(&set->set_waitq); + spin_lock_init(&set->set_lock); +} + +void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + + LASSERT(set); + list_for_each_safe(pos, n, &set->set_list) { + struct lov_request *req = list_entry(pos, + struct lov_request, + rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_oa) + OBDO_FREE(req->rq_oi.oi_oa); + if (req->rq_oi.oi_md) + OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_osfs) + OBD_FREE(req->rq_oi.oi_osfs, + sizeof(*req->rq_oi.oi_osfs)); + OBD_FREE(req, sizeof(*req)); + } + + if (set->set_pga) { + int len = set->set_oabufs * sizeof(*set->set_pga); + OBD_FREE_LARGE(set->set_pga, len); + } + if (set->set_lockh) + lov_llh_put(set->set_lockh); + + OBD_FREE(set, sizeof(*set)); +} + +int lov_set_finished(struct lov_request_set *set, int idempotent) +{ + int completes = atomic_read(&set->set_completes); + + CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count); + + if (completes == set->set_count) { + if (idempotent) + return 1; + if (atomic_inc_return(&set->set_finish_checked) == 1) + return 1; + } + return 0; +} + +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + req->rq_complete = 1; + req->rq_rc = rc; + + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); + + wake_up(&set->set_waitq); +} + +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + /* FIXME in raid1 regime, should return 0 */ + return rc; +} + +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc; + struct lov_tgt_desc *tgt; + + mutex_lock(&lov->lov_lock); + tgt = lov->lov_tgts[idx]; + rc = !tgt || tgt->ltd_active || + (tgt->ltd_exp && + class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried); + mutex_unlock(&lov->lov_lock); + + return rc; +} + +/* Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. 
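lov_init_set(), lov_update_set() and lov_set_finished() above implement the completion accounting for a request set: each finished sub-request bumps set_completes, and once all set_count of them have completed, set_finish_checked ensures that only one caller finalizes a non-idempotent set. A compact sketch of that check with C11 atomics (single-threaded here just to show the counting; the field names mirror the kernel ones but the types are plain stand-ins):

	#include <stdatomic.h>
	#include <stdio.h>

	struct set { int count; atomic_int completes, finish_checked; };

	/* mirrors lov_set_finished(): when not idempotent, only the first
	 * caller that observes a fully completed set gets a non-zero result */
	static int set_finished(struct set *s, int idempotent)
	{
		if (atomic_load(&s->completes) != s->count)
			return 0;
		if (idempotent)
			return 1;
		return atomic_fetch_add(&s->finish_checked, 1) == 0;
	}

	int main(void)
	{
		struct set s = { 3, 0, 0 };

		for (int i = 0; i < 3; i++)
			atomic_fetch_add(&s.completes, 1);	/* as in lov_update_set() */

		printf("%d %d\n", set_finished(&s, 0), set_finished(&s, 0));	/* 1 0 */
		return 0;
	}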
+ */ +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct lov_tgt_desc *tgt; + int rc = 0; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(tgt == NULL)) { + rc = 0; + goto out; + } + + if (likely(tgt->ltd_active)) { + rc = 1; + goto out; + } + + if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) { + rc = 0; + goto out; + } + + mutex_unlock(&lov->lov_lock); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout), + cfs_time_seconds(1), NULL, NULL); + + rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi); + if (tgt != NULL && tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +static int common_attr_done(struct lov_request_set *set) +{ + struct list_head *pos; + struct lov_request *req; + struct obdo *tmp_oa; + int rc = 0, attrset = 0; + + LASSERT(set->set_oi != NULL); + + if (set->set_oi->oi_oa == NULL) + return 0; + + if (!atomic_read(&set->set_success)) + return -EIO; + + OBDO_ALLOC(tmp_oa); + if (tmp_oa == NULL) { + rc = -ENOMEM; + goto out; + } + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (!req->rq_complete || req->rq_rc) + continue; + if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */ + continue; + lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa, + req->rq_oi.oi_oa->o_valid, + set->set_oi->oi_md, req->rq_stripe, &attrset); + } + if (!attrset) { + CERROR("No stripes had valid attrs\n"); + rc = -EIO; + } + if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) && + (set->set_oi->oi_md->lsm_stripe_count != attrset)) { + /* When we take attributes of some epoch, we require all the + * ost to be active. */ + CERROR("Not all the stripes had valid attrs\n"); + rc = -EIO; + goto out; + } + + tmp_oa->o_oi = set->set_oi->oi_oa->o_oi; + memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa)); +out: + if (tmp_oa) + OBDO_FREE(tmp_oa); + return rc; + +} + +int lov_fini_getattr_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) + rc = common_attr_done(set); + + lov_put_reqset(set); + + return rc; +} + +/* The callback for osc_getattr_async that finalizes a request info when a + * response is received. */ +static int cb_getattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = oinfo->oi_md->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH) { + /* SOM requires all the OSTs to be active. 
*/ + rc = -EIO; + goto out_set; + } + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_cb_up = cb_getattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_getattr_set(set); + return rc; +} + +int lov_fini_destroy_set(struct lov_request_set *set) +{ + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + + return 0; +} + +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_oi->oi_oa = src_oa; + set->set_oti = oti; + if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = lsm->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_destroy_set(set); + return rc; +} + +int lov_fini_setattr_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + rc = common_attr_done(set); + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + return rc; +} + +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov; + struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + if (rc == 0) { + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime = + req->rq_oi.oi_oa->o_ctime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime = + req->rq_oi.oi_oa->o_mtime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime = + req->rq_oi.oi_oa->o_atime; + } + + return rc; +} + +/* The 
callback for osc_setattr_async that finalizes a request info when a + * response is received. */ +static int cb_setattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oti = oti; + set->set_oi = oinfo; + if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i]; + struct lov_request *req; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_setattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) { + int off = lov_stripe_offset(oinfo->oi_md, + oinfo->oi_oa->o_size, i, + &req->rq_oi.oi_oa->o_size); + + if (off < 0 && req->rq_oi.oi_oa->o_size) + req->rq_oi.oi_oa->o_size--; + + CDEBUG(D_INODE, "stripe %d has size %llu/%llu\n", + i, req->rq_oi.oi_oa->o_size, + oinfo->oi_oa->o_size); + } + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_setattr_set(set); + return rc; +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) + +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success) +{ + if (success) { + __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + return 0; + } + + return -EIO; +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + lov_put_reqset(set); + return rc; +} + +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits 
difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + else + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* The callback for osc_statfs_async that finalizes a request info when a + * response is received. */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. 
*/ + lov_update_set(set, lovreq, rc); + if (rc) + goto out; + + obd_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + goto out_update; + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + obd_putref(lovobd); + +out: + if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD && + lov_set_finished(set, 0)) { + lov_statfs_interpret(NULL, set, set->set_count != + atomic_read(&set->set_success)); + } + + return 0; +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_request *req; + + if (lov->lov_tgts[i] == NULL || + (!lov_check_and_wait_active(lov, i) && + (oinfo->oi_flags & OBD_STATFS_NODELAY))) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* skip targets that have been explicitly disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_statfs_set(set); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c new file mode 100644 index 000000000..42336f13a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c @@ -0,0 +1,209 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
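lov_update_statfs() above merges per-OST statfs replies; because OSTs may report different (power-of-two) block sizes, the counts expressed in the smaller block size are shifted down so that everything is summed in the larger block size, and file counts are capped via LOV_SUM_MAX to avoid overflow. A sketch of the block-size normalization with example numbers (the kernel derives the shift with a bit-scanning loop; here it is computed directly, which should give the same result for power-of-two sizes):

	#include <stdio.h>

	struct sfs { unsigned long long bsize, blocks, bfree; };

	/* merge b into a, normalizing to the larger power-of-two block size */
	static void merge(struct sfs *a, const struct sfs *b_in)
	{
		struct sfs b = *b_in;
		unsigned long long big   = a->bsize > b.bsize ? a->bsize : b.bsize;
		unsigned long long small = a->bsize > b.bsize ? b.bsize : a->bsize;
		int shift = 0;

		while ((small << shift) < big)
			shift++;

		if (a->bsize < b.bsize) {	/* express a's counts in b's blocks */
			a->bsize = b.bsize;
			a->blocks >>= shift;
			a->bfree >>= shift;
		} else {			/* express b's counts in a's blocks */
			b.blocks >>= shift;
			b.bfree >>= shift;
		}
		a->blocks += b.blocks;
		a->bfree += b.bfree;
	}

	int main(void)
	{
		struct sfs a = { 4096, 1000, 500 };	/* 4 KiB blocks */
		struct sfs b = { 65536, 100, 60 };	/* 64 KiB blocks */

		merge(&a, &b);
		/* 64 KiB = 16 * 4 KiB, so a's counts were divided by 16 */
		printf("bsize=%llu blocks=%llu bfree=%llu\n",
		       a.bsize, a.blocks, a.bfree);
		return 0;
	}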
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub transfer operations. + * + */ + +static void lovsub_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lovsub_req *lsr; + + lsr = cl2lovsub_req(slice); + OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. + */ +static void lovsub_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct lovsub_object *subobj; + + subobj = cl2lovsub(obj); + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = subobj->lso_index; +} + +static const struct cl_req_operations lovsub_req_ops = { + .cro_attr_set = lovsub_req_attr_set, + .cro_completion = lovsub_req_completion +}; + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + return rc; + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + return rc; +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_super = NULL; + lsd->acid_next = NULL; + return next; +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + if (atomic_read(&d->ld_ref) && d->ld_site) { + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); + lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lovsub_req *lsr; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, GFP_NOFS); + if (lsr != NULL) { + cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations lovsub_cl_ops = { + .cdo_req_init = lovsub_req_init +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct 
lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + lsd->acid_cl.cd_ops = &lovsub_cl_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c new file mode 100644 index 000000000..783ec687a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c @@ -0,0 +1,55 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub io operations. + * + */ + +/* All trivial */ + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c new file mode 100644 index 000000000..62b696d25 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c @@ -0,0 +1,466 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub lock operations. + * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + lsl = cl2lovsub_lock(slice); + LASSERT(list_empty(&lsl->lss_parents)); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); +} + +static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + parent = lov->lls_cl.cls_lock; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lovsub-parent", current); + cl_lock_mutex_get(env, parent); +} + +static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + parent = lov->lls_cl.cls_lock; + cl_lock_mutex_put(env, lov->lls_cl.cls_lock); + lu_ref_del(&parent->cll_reference, "lovsub-parent", current); + cl_lock_put(env, parent); +} + +/** + * Implements cl_lock_operations::clo_state() method for lovsub layer, which + * method is called whenever sub-lock state changes. Propagates state change + * to the top-locks. + */ +static void lovsub_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + struct lov_lock *lov = scan->lll_super; + struct cl_lock *parent = lov->lls_cl.cls_lock; + + if (sub->lss_active != parent) { + lovsub_parent_lock(env, lov); + cl_lock_signal(env, parent); + lovsub_parent_unlock(env, lov); + } + } +} + +/** + * Implementation of cl_lock_operation::clo_weigh() estimating lock weight by + * asking parent lock. + */ +static unsigned long lovsub_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lovsub_lock *lock = cl2lovsub_lock(slice); + struct lov_lock *lov; + unsigned long dumbbell; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + if (!list_empty(&lock->lss_parents)) { + /* + * It is not clear whether all parents have to be asked and + * their estimations summed, or it is enough to ask one. For + * the current usages, one is always enough. 
+ */ + lov = container_of(lock->lss_parents.next, + struct lov_lock_link, lll_list)->lll_super; + + lovsub_parent_lock(env, lov); + dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock); + lovsub_parent_unlock(env, lov); + } else + dumbbell = 0; + + return dumbbell; +} + +/** + * Maps start/end offsets within a stripe, to offsets within a file. + */ +static void lovsub_lock_descr_map(const struct cl_lock_descr *in, + struct lov_object *lov, + int stripe, struct cl_lock_descr *out) +{ + pgoff_t size; /* stripe size in pages */ + pgoff_t skip; /* how many pages in every stripe are occupied by + * "other" stripes */ + pgoff_t start; + pgoff_t end; + + start = in->cld_start; + end = in->cld_end; + + if (lov->lo_lsm->lsm_stripe_count > 1) { + size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size); + skip = (lov->lo_lsm->lsm_stripe_count - 1) * size; + + /* XXX overflow check here? */ + start += start/size * skip + stripe * size; + + if (end != CL_PAGE_EOF) { + end += end/size * skip + stripe * size; + /* + * And check for overflow... + */ + if (end < in->cld_end) + end = CL_PAGE_EOF; + } + } + out->cld_start = start; + out->cld_end = end; +} + +/** + * Adjusts parent lock extent when a sub-lock is attached to a parent. This is + * called in two ways: + * + * - as part of receive call-back, when server returns granted extent to + * the client, and + * + * - when top-lock finds existing sub-lock in the cache. + * + * Note, that lock mode is not propagated to the parent: i.e., if CLM_READ + * top-lock matches CLM_WRITE sub-lock, top-lock is still CLM_READ. + */ +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx) +{ + struct cl_lock *parent; + struct lovsub_object *subobj; + struct cl_lock_descr *pd; + struct cl_lock_descr *parent_descr; + int result; + + parent = lov->lls_cl.cls_lock; + parent_descr = &parent->cll_descr; + LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode)); + + subobj = cl2lovsub(sublock->lss_cl.cls_obj); + pd = &lov_env_info(env)->lti_ldescr; + + pd->cld_obj = parent_descr->cld_obj; + pd->cld_mode = parent_descr->cld_mode; + pd->cld_gid = parent_descr->cld_gid; + lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd); + lov->lls_sub[idx].sub_got = *d; + /* + * Notify top-lock about modification, if lock description changes + * materially. 
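lovsub_lock_descr_map() above goes in the opposite direction from the per-page byte-offset mapping sketched earlier: it takes a page range expressed inside one stripe and maps it back to file page indices, using skip = (stripe_count - 1) * stripe_size_in_pages. A sketch that round-trips a few file pages through the forward mapping and this inverse formula (the stripe geometry values are arbitrary examples):

	#include <stdio.h>

	int main(void)
	{
		unsigned long size  = 256;	/* stripe size in pages (example) */
		unsigned long count = 4;	/* stripe count (example) */
		unsigned long skip  = (count - 1) * size;

		for (unsigned long file = 0; file < 4000; file += 777) {
			/* forward: file page -> (stripe, in-stripe page) */
			unsigned long chunk  = file / size;
			unsigned long stripe = chunk % count;
			unsigned long sub    = (chunk / count) * size + file % size;
			/* inverse, as in lovsub_lock_descr_map() */
			unsigned long back   = sub + sub / size * skip + stripe * size;

			printf("file %4lu -> stripe %lu sub %4lu -> file %4lu\n",
			       file, stripe, sub, back);
		}
		return 0;
	}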
+ */ + if (!cl_lock_ext_match(parent_descr, pd)) + result = cl_lock_modify(env, parent, pd); + else + result = 0; + return result; +} + +static int lovsub_lock_modify(const struct lu_env *env, + const struct cl_lock_slice *s, + const struct cl_lock_descr *d) +{ + struct lovsub_lock *lock = cl2lovsub_lock(s); + struct lov_lock_link *scan; + struct lov_lock *lov; + int result = 0; + + LASSERT(cl_lock_mode_match(d->cld_mode, + s->cls_lock->cll_descr.cld_mode)); + list_for_each_entry(scan, &lock->lss_parents, lll_list) { + int rc; + + lov = scan->lll_super; + lovsub_parent_lock(env, lov); + rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx); + lovsub_parent_unlock(env, lov); + result = result ?: rc; + } + return result; +} + +static int lovsub_lock_closure(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure) +{ + struct lovsub_lock *sub; + struct cl_lock *parent; + struct lov_lock_link *scan; + int result; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + sub = cl2lovsub_lock(slice); + result = 0; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + parent = scan->lll_super->lls_cl.cls_lock; + result = cl_lock_closure_build(env, parent, closure); + if (result != 0) + break; + } + return result; +} + +/** + * A helper function for lovsub_lock_delete() that deals with a given parent + * top-lock. + */ +static int lovsub_lock_delete_one(const struct lu_env *env, + struct cl_lock *child, struct lov_lock *lov) +{ + struct cl_lock *parent; + int result; + + parent = lov->lls_cl.cls_lock; + if (parent->cll_error) + return 0; + + result = 0; + switch (parent->cll_state) { + case CLS_ENQUEUED: + /* See LU-1355 for the case that a glimpse lock is + * interrupted by signal */ + LASSERT(parent->cll_flags & CLF_CANCELLED); + break; + case CLS_QUEUING: + case CLS_FREEING: + cl_lock_signal(env, parent); + break; + case CLS_INTRANSIT: + /* + * Here lies a problem: a sub-lock is canceled while top-lock + * is being unlocked. Top-lock cannot be moved into CLS_NEW + * state, because unlocking has to succeed eventually by + * placing lock into CLS_CACHED (or failing it), see + * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED + * state, because lov maintains an invariant that all + * sub-locks exist in CLS_CACHED (this allows cached top-lock + * to be reused immediately). Nor can we wait for top-lock + * state to change, because this can be synchronous to the + * current thread. + * + * We know for sure that lov_lock_unuse() will be called at + * least one more time to finish un-using, so leave a mark on + * the top-lock, that will be seen by the next call to + * lov_lock_unuse(). + */ + if (cl_lock_is_intransit(parent)) + lov->lls_cancel_race = 1; + break; + case CLS_CACHED: + /* + * if a sub-lock is canceled move its top-lock into CLS_NEW + * state to preserve an invariant that a top-lock in + * CLS_CACHED is immediately ready for re-use (i.e., has all + * sub-locks), and so that next attempt to re-use the top-lock + * enqueues missing sub-lock. + */ + cl_lock_state_set(env, parent, CLS_NEW); + /* fall through */ + case CLS_NEW: + /* + * if last sub-lock is canceled, destroy the top-lock (which + * is now `empty') proactively. + */ + if (lov->lls_nr_filled == 0) { + /* ... but unfortunately, this cannot be done easily, + * as cancellation of a top-lock might acquire mutices + * of its other sub-locks, violating lock ordering, + * see cl_lock_{cancel,delete}() preconditions. 
+ * + * To work around this, the mutex of this sub-lock is + * released, top-lock is destroyed, and sub-lock mutex + * acquired again. The list of parents has to be + * re-scanned from the beginning after this. + * + * Only do this if no mutices other than on @child and + * @parent are held by the current thread. + * + * TODO: The lock modal here is too complex, because + * the lock may be canceled and deleted by voluntarily: + * cl_lock_request + * -> osc_lock_enqueue_wait + * -> osc_lock_cancel_wait + * -> cl_lock_delete + * -> lovsub_lock_delete + * -> cl_lock_cancel/delete + * -> ... + * + * The better choice is to spawn a kernel thread for + * this purpose. -jay + */ + if (cl_lock_nr_mutexed(env) == 2) { + cl_lock_mutex_put(env, child); + cl_lock_cancel(env, parent); + cl_lock_delete(env, parent); + result = 1; + } + } + break; + case CLS_HELD: + CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n"); + default: + CERROR("Impossible state: %d\n", parent->cll_state); + LBUG(); + break; + } + + return result; +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked in "bottom-to-top" delete, when lock destruction starts from the + * sub-lock (e.g, as a result of ldlm lock LRU policy). + */ +static void lovsub_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *child = slice->cls_lock; + struct lovsub_lock *sub = cl2lovsub_lock(slice); + int restart; + + LASSERT(cl_lock_is_mutexed(child)); + + /* + * Destruction of a sub-lock might take multiple iterations, because + * when the last sub-lock of a given top-lock is deleted, top-lock is + * canceled proactively, and this requires to release sub-lock + * mutex. Once sub-lock mutex has been released, list of its parents + * has to be re-scanned from the beginning. 
+ */ + do { + struct lov_lock *lov; + struct lov_lock_link *scan; + struct lov_lock_link *temp; + struct lov_lock_sub *subdata; + + restart = 0; + list_for_each_entry_safe(scan, temp, + &sub->lss_parents, lll_list) { + lov = scan->lll_super; + subdata = &lov->lls_sub[scan->lll_idx]; + lovsub_parent_lock(env, lov); + subdata->sub_got = subdata->sub_descr; + lov_lock_unlink(env, scan, sub); + restart = lovsub_lock_delete_one(env, child, lov); + lovsub_parent_unlock(env, lov); + + if (restart) { + cl_lock_mutex_get(env, child); + break; + } + } + } while (restart); +} + +static int lovsub_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock *lov; + struct lov_lock_link *scan; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + lov = scan->lll_super; + (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov); + if (lov != NULL) + cl_lock_descr_print(env, cookie, p, + &lov->lls_cl.cls_lock->cll_descr); + (*p)(env, cookie, "] "); + } + return 0; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, + .clo_state = lovsub_lock_state, + .clo_delete = lovsub_lock_delete, + .clo_modify = lovsub_lock_modify, + .clo_closure = lovsub_lock_closure, + .clo_weigh = lovsub_lock_weigh, + .clo_print = lovsub_lock_print +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); + if (lsk != NULL) { + INIT_LIST_HEAD(&lsk->lss_parents); + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c new file mode 100644 index 000000000..57e3629fc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c @@ -0,0 +1,164 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); + result = 0; + } else + result = -ENOMEM; + return result; + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + /* We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los); + spin_lock(&lov->u.raid0.lo_sub_lock); + lov->u.raid0.lo_sub[los->lso_index] = NULL; + spin_unlock(&lov->u.raid0.lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + lov_r0(lov)->lo_attr_valid = 0; + return 0; +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + return cl_object_glimpse(env, &los->lso_super->lo_cl, lvb); +} + + + +static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, + .coo_attr_set = lovsub_attr_set, + .coo_glimpse = lovsub_object_glimpse +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los != NULL) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c new file mode 100644 index 000000000..3f00ce967 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c @@ -0,0 +1,71 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR 
REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub page operations. + * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *unused) +{ + struct lovsub_page *lsb = cl_object_page_slice(obj, page); + + cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops); + return 0; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c b/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c new file mode 100644 index 000000000..174cbf5c1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c @@ -0,0 +1,311 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" +#include +#include "lov_internal.h" + +static int lov_stripesize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%llu\n", desc->ld_default_stripe_size); + return 0; +} + +static ssize_t lov_stripesize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripesize); + +static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%llu\n", desc->ld_default_stripe_offset); + return 0; +} + +static ssize_t lov_stripeoffset_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + desc->ld_default_stripe_offset = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripeoffset); + +static int lov_stripetype_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_pattern); + return 0; +} + +static ssize_t lov_stripetype_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_pattern(&val); + desc->ld_pattern = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripetype); + +static int lov_stripecount_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%d\n", (__s16)(desc->ld_default_stripe_count + 1) - 1); + return 0; +} + +static ssize_t lov_stripecount_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_count(&val); + desc->ld_default_stripe_count = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripecount); + +static int lov_numobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = 
&dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_numobd); + +static int lov_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_activeobd); + +static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_desc_uuid); + +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + + seq_printf(p, "%d: %s %sACTIVE\n", + tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lov_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); +LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); + +static struct lprocfs_vars lprocfs_lov_obd_vars[] = { + { "uuid", &lov_uuid_fops, NULL, 0 }, + { "stripesize", &lov_stripesize_fops, NULL }, + { "stripeoffset", &lov_stripeoffset_fops, NULL }, + { "stripecount", &lov_stripecount_fops, NULL }, + { "stripetype", &lov_stripetype_fops, NULL }, + { "numobd", &lov_numobd_fops, NULL, 0 }, + { "activeobd", &lov_activeobd_fops, NULL, 0 }, + { "filestotal", &lov_filestotal_fops, NULL, 0 }, + { "filesfree", &lov_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "blocksize", &lov_blksize_fops, NULL, 0 }, + { "kbytestotal", &lov_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &lov_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &lov_kbytesavail_fops, NULL, 0 }, + { "desc_uuid", &lov_desc_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(lov, numrefs); + +static struct lprocfs_vars lprocfs_lov_module_vars[] = { + { "num_refs", &lov_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lov_module_vars; + lvars->obd_vars = lprocfs_lov_obd_vars; +} + +const struct file_operations lov_proc_target_fops = { + .owner = THIS_MODULE, + .open = lov_target_seq_open, + .read = seq_read, + 
.llseek = seq_lseek, + .release = lprocfs_seq_release, +}; diff --git a/kernel/drivers/staging/lustre/lustre/mdc/Makefile b/kernel/drivers/staging/lustre/lustre/mdc/Makefile new file mode 100644 index 000000000..2516551a6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += mdc.o +mdc-y := mdc_request.o mdc_reint.o mdc_lib.o mdc_locks.o +mdc-$(CONFIG_PROC_FS) += lproc_mdc.o diff --git a/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c new file mode 100644 index 000000000..acfe08e45 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c @@ -0,0 +1,220 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "mdc_internal.h" + +static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > MDC_MAX_RIF_MAX) + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); + +static int mdc_kuc_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, PDE_DATA(inode)); +} + +/* temporary for testing */ +static ssize_t mdc_kuc_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = + ((struct seq_file *)file->private_data)->private; + struct kuc_hdr *lh; + struct hsm_action_list *hal; + struct hsm_action_item *hai; + int len; + int fd, rc; + + rc = lprocfs_write_helper(buffer, count, &fd); + if (rc) + return rc; + + if (fd < 0) + return -ERANGE; + CWARN("message to fd %d\n", fd); + + len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN + + /* for mockup below */ 2 * cfs_size_round(sizeof(*hai)); + + OBD_ALLOC(lh, len); + if (!lh) + return -ENOMEM; + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_HSM; + lh->kuc_msgtype = HMT_ACTION_LIST; + lh->kuc_msglen = len; + + hal = (struct hsm_action_list *)(lh + 1); + hal->hal_version = HAL_VERSION; + hal->hal_archive_id = 1; + hal->hal_flags = 0; + obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN); + + /* mock up an action list */ + hal->hal_count = 2; + hai = hai_zero(hal); + hai->hai_action = HSMA_ARCHIVE; + hai->hai_fid.f_oid = 5; + hai->hai_len = sizeof(*hai); + hai = hai_next(hai); + hai->hai_action = HSMA_RESTORE; + hai->hai_fid.f_oid = 10; + hai->hai_len = sizeof(*hai); + + /* This works for either broadcast or unicast to a single fd */ + if (fd == 0) { + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + } else { + struct file *fp = fget(fd); + + rc = libcfs_kkuc_msg_put(fp, lh); + fput(fp); + } + OBD_FREE(lh, len); + if (rc < 0) + return rc; + return count; +} + +struct file_operations mdc_kuc_fops = { + .open = mdc_kuc_open, + .write = mdc_kuc_write, + .release = single_release, +}; + +LPROC_SEQ_FOPS_WR_ONLY(mdc, ping); + +LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); + +static int mdc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_obd_rd_max_pages_per_rpc(m, m->private); +} 
+LPROC_SEQ_FOPS_RO(mdc_obd_max_pages_per_rpc); + +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +static struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { "uuid", &mdc_uuid_fops, NULL, 0 }, + { "ping", &mdc_ping_fops, NULL, 0222 }, + { "connect_flags", &mdc_connect_flags_fops, NULL, 0 }, + { "blocksize", &mdc_blksize_fops, NULL, 0 }, + { "kbytestotal", &mdc_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &mdc_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &mdc_kbytesavail_fops, NULL, 0 }, + { "filestotal", &mdc_filestotal_fops, NULL, 0 }, + { "filesfree", &mdc_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "mds_server_uuid", &mdc_server_uuid_fops, NULL, 0 }, + { "mds_conn_uuid", &mdc_conn_uuid_fops, NULL, 0 }, + /* + * FIXME: below proc entry is provided, but not in used, instead + * sbi->sb_md_brw_size is used, the per obd variable should be used + * when CMD is enabled, and dir pages are managed in MDC layer. + * Remember to enable proc write function. + */ + { "max_pages_per_rpc", &mdc_obd_max_pages_per_rpc_fops, NULL, 0 }, + { "max_rpcs_in_flight", &mdc_max_rpcs_in_flight_fops, NULL, 0 }, + { "timeouts", &mdc_timeouts_fops, NULL, 0 }, + { "import", &mdc_import_fops, NULL, 0 }, + { "state", &mdc_state_fops, NULL, 0 }, + { "hsm_nl", &mdc_kuc_fops, NULL, 0200 }, + { "pinger_recov", &mdc_pinger_recov_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(mdc, numrefs); + +static struct lprocfs_vars lprocfs_mdc_module_vars[] = { + { "num_refs", &mdc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mdc_module_vars; + lvars->obd_vars = lprocfs_mdc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h new file mode 100644 index 000000000..81780c943 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h @@ -0,0 +1,181 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include "../include/lustre_mdc.h" +#include "../include/lustre_mds.h" + +#if defined CONFIG_PROC_FS +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + struct obd_capa *oc, __u64 valid, int ea_size, + __u32 suppgid, int flags); +void mdc_pack_capa(struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa *oc); +int mdc_pack_req(struct ptlrpc_request *req, int version, int opc); +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags); +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data); +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size, + const struct lu_fid *fid, struct obd_capa *oc); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *data, int ea_size); +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len); +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, __u32 uid, + __u32 gid, cfs_cap_t capability, __u64 rdev); +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u64 flags, const void *data, + int datalen); +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen); +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +int mdc_enter_request(struct client_obd *cli); +void mdc_exit_request(struct client_obd *cli); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *, + void *lmm, int lmmsize, + struct lookup_intent *, int, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags); + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +int mdc_open(struct obd_export *exp, u64 ino, int type, int flags, + struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, + struct ptlrpc_request **); + +struct obd_client_handle; + +int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *lmv_exp, + struct lustre_md *md); + +int 
mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md); + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it); + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och); +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + ldlm_cancel_flags_t flags, void *opaque); + +static inline void mdc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo); + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c new file mode 100644 index 000000000..d3234cb1e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c @@ -0,0 +1,593 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_idl.h" +#include "mdc_internal.h" + + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT(b != NULL); + + b->suppgid = suppgid; + b->uid = from_kuid(&init_user_ns, current_uid()); + b->gid = from_kgid(&init_user_ns, current_gid()); + b->fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->capability = cfs_curproc_cap_pack(); +} + +void mdc_pack_capa(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + struct req_capsule *pill = &req->rq_pill; + struct lustre_capa *c; + + if (oc == NULL) { + LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0); + return; + } + + c = req_capsule_client_get(pill, field); + LASSERT(c != NULL); + capa_cpy(c, oc); + DEBUG_CAPA(D_SEC, c, "pack"); +} + +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (pfid) { + b->fid1 = *pfid; + b->valid = OBD_MD_FLID; + } + if (cfid) + b->fid2 = *cfid; + b->flags = flags; +} + +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); +} + +void mdc_pack_body(struct ptlrpc_request *req, + const struct lu_fid *fid, struct obd_capa *oc, + __u64 valid, int ea_size, __u32 suppgid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + b->valid = valid; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + mdc_pack_capa(req, &RMF_CAPA1, oc); + } +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, + __u32 size, const struct lu_fid *fid, struct obd_capa *oc) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + b->size = pgoff; /* !! */ + b->nlink = size; /* !! 
*/ + __mdc_pack_body(b, -1); + b->mode = LUDA_FID | LUDA_TYPE; + + mdc_pack_capa(req, &RMF_CAPA1, oc); +} + +/* packing of MDS records */ +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, + __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + if (data) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } +} + +static __u64 mds_pack_open_flags(__u64 flags, __u32 mode) +{ + __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | + MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | + MDS_OPEN_RELEASE)); + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; + if (flags & __FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u64 flags, const void *lmm, + int lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = cfs_curproc_cap_pack(); + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags, mode); + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + rec->cr_old_handle = op_data->op_handle; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + /* the next buffer is child capa, which is used for replay, + * will be packed from the data in reply message. 
*/ + + if (op_data->op_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + } + set_mrc_cr_flags(rec, cr_flags); +} + +static inline __u64 attr_pack(unsigned int ia_valid) +{ + __u64 sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid & ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_valid & ATTR_ATTR_FLAG) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_valid & ATTR_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_FROM_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_valid & ATTR_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_valid & MDS_OPEN_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = cfs_curproc_cap_pack(); + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); + rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); + rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); + rec->sa_attr_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle)); + epoch->ioepoch = op_data->op_ioepoch; + epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len) +{ + struct mdt_rec_setattr *rec; + struct mdt_ioepoch *epoch; + struct lov_user_md *lum = NULL; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_flags & (MF_SOM_CHANGE | 
MF_EPOCH_OPEN)) { + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + if (ealen == 0) + return; + + lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); + } else { + memcpy(lum, ea, ealen); + } + + if (ea2len == 0) + return; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2, + ea2len); +} + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1 = op_data->op_suppgids[0]; + rec->ul_suppgid2 = -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(tmp != NULL); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */ + rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */ + rec->lk_cap = op_data->op_cap; /* current->cap_effective; */ + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen) +{ + struct mdt_rec_rename *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(old, oldlen, tmp); + + if (new) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT); + LOGL0(new, newlen, tmp); + } +} 
+ +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *op_data, int ea_size) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + b->valid = valid; + if (op_data->op_bias & MDS_CHECK_SPLIT) + b->valid |= OBD_MD_FLCKSPLIT; + if (op_data->op_bias & MDS_CROSS_REF) + b->valid |= OBD_MD_FLCROSSREF; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_name) { + char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + } +} + +static void mdc_hsm_release_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + if (op_data->op_bias & MDS_HSM_RELEASE) { + struct close_data *data; + struct ldlm_lock *lock; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + ldlm_lock_put(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; + } +} + +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_ioepoch_pack(epoch, op_data); + mdc_hsm_release_pack(req, op_data); +} + +static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +}; + +/* We record requests in flight in cli->cl_r_in_flight here. + * There is only one write rpc possible in mdc anyway. If this to change + * in the future - the code may need to be revisited. 
*/ +int mdc_enter_request(struct client_obd *cli) +{ + int rc = 0; + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), + &lwi); + if (rc) { + client_obd_list_lock(&cli->cl_loi_list_lock); + if (list_empty(&mcw.mcw_entry)) + cli->cl_r_in_flight--; + list_del_init(&mcw.mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + return rc; +} + +void mdc_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + /* Empty waiting list? Decrease reqs in-flight number */ + + client_obd_list_unlock(&cli->cl_loi_list_lock); +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c new file mode 100644 index 000000000..d1c224ecd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c @@ -0,0 +1,1313 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include + +#include "../include/lustre_intent.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" /* fid_res_name_eq() */ +#include "../include/lustre_mdc.h" +#include "../include/lustre_net.h" +#include "../include/lustre_req_layout.h" +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; + struct ldlm_enqueue_info *ga_einfo; +}; + +int it_disposition(struct lookup_intent *it, int flag) +{ + return it->d.lustre.it_disposition & flag; +} +EXPORT_SYMBOL(it_disposition); + +void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition |= flag; +} +EXPORT_SYMBOL(it_set_disposition); + +void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition &= ~flag; +} +EXPORT_SYMBOL(it_clear_disposition); + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_LEASE)) { + if (phase >= DISP_OPEN_LEASE) + return it->d.lustre.it_status; + else + return 0; + } + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, + it->d.lustre.it_status); + LBUG(); + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, + __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + + if (bits) + *bits = 0; + + if (!*lockh) + return 0; + + lock = ldlm_handle2lock((struct lustre_handle *)lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n", + old_inode, old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, new_inode, new_inode->i_ino, + new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + return 0; +} + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + ldlm_mode_t rc; + + fid_build_reg_res_name(fid, &res_id); + /* LU-4405: Clear bits not supported by server */ + policy->l_inodebits.bits &= exp_connect_ibits(exp); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh, 0); + return rc; +} + +int mdc_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_res_id res_id; + 
struct obd_device *obd = class_exp2obd(exp); + int rc; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + return rc; +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + return 0; +} + +/* find any ldlm lock of the inode in mdc + * return 0 not find + * 1 find one + * < 0 error */ +int mdc_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct ldlm_res_id res_id; + int rc = 0; + + fid_build_reg_res_name((struct lu_fid *)fid, &res_id); + rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id, + it, data); + if (rc == LDLM_ITER_STOP) + return 1; + else if (rc == LDLM_ITER_CONTINUE) + return 0; + return rc; +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); + LBUG(); + } +} + +/* Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... */ +static void mdc_realloc_openmsg(struct ptlrpc_request *req, + struct mdt_body *body) +{ + int rc; + + /* FIXME: remove this explicit offset. */ + rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4, + body->eadatasize); + if (rc) { + CERROR("Can't enlarge segment %d size to %d\n", + DLM_INTENT_REC_OFF + 4, body->eadatasize); + body->valid &= ~OBD_MD_FLEASIZE; + body->eadatasize = 0; + } +} + +static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data, + void *lmm, int lmmsize, + void *cb_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + LIST_HEAD(cancels); + int count = 0; + int mode; + int rc; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. 
*/ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + mode = LCK_CW; + else if (it->it_flags & __FMODE_EXEC) + mode = LCK_PR; + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return ERR_PTR(-ENOMEM); + } + + /* parent capability */ + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + /* child capability, reserve the size according to parent capa, it will + * be filled after we get the reply */ + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, + lmmsize); + + /* for remote client, fetch remote perm for current user */ + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0, maxdata; + LIST_HEAD(cancels); + + + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + + maxdata = class_exp2cliimp(exp)->imp_connect_data.ocd_max_easize; + + /* pack the intended request */ + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, maxdata, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, maxdata); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, + RCL_SERVER, maxdata); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, + RCL_SERVER, maxdata); + + ptlrpc_request_set_replen(req); + + return req; +} + +static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + 
&RQF_LDLM_INTENT_UNLINK); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + obddev->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | + OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | + OBD_MD_FLMDSCAPA | OBD_MD_MEA | + (client_is_remote(exp) ? + OBD_MD_FLRMTPERM : OBD_MD_FLACL); + struct ldlm_intent *lit; + int rc; + int easize; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obddev->u.cli.cl_default_mds_easize > 0) + easize = obddev->u.cli.cl_default_mds_easize; + else + easize = obddev->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *unused) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + /* LAYOUT_INTENT_ACCESS is generic, specific operation will be + * set for replication */ + layout->li_opc = LAYOUT_INTENT_ACCESS; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request * +mdc_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + req = 
ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + return req; +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, + int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct lustre_intent_data *intent = &it->d.lustre; + struct ldlm_lock *lock; + void *lvb_data = NULL; + int lvb_len = 0; + + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + intent->it_disposition = (int)lockrep->lock_policy_res1; + intent->it_status = (int)lockrep->lock_policy_res2; + intent->it_lock_mode = einfo->ei_mode; + intent->it_lock_handle = lockh->cookie; + intent->it_data = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant */ + if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, intent->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! Otherwise we might exit the + * function without doing so, and try to replay a failed create + * (bug 3440) */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || intent->it_status != 0)) + mdc_clear_replay_flag(req, intent->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", + it->it_op, intent->it_disposition, intent->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) { + struct mdt_body *body; + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("Can't swab mdt_body\n"); + return -EPROTO; + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. 
+ */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + return -EPROTO; + + /* save lvb data and length in case this is for layout + * lock */ + lvb_data = eadata; + lvb_len = body->eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + void *lmm; + + if (req_capsule_get_size(pill, &RMF_EADATA, + RCL_CLIENT) < + body->eadatasize) + mdc_realloc_openmsg(req, body); + else + req_capsule_shrink(pill, &RMF_EADATA, + body->eadatasize, + RCL_CLIENT); + + req_capsule_set_size(pill, &RMF_EADATA, + RCL_CLIENT, + body->eadatasize); + + lmm = req_capsule_client_get(pill, &RMF_EADATA); + if (lmm) + memcpy(lmm, eadata, body->eadatasize); + } + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + return -EPROTO; + } + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa, *p; + + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + return -EPROTO; + + if (it->it_op & IT_OPEN) { + /* client fid capa will be checked in replay */ + p = req_capsule_client_get(pill, &RMF_CAPA2); + LASSERT(p); + *p = *capa; + } + } + if (body->valid & OBD_MD_FLOSSCAPA) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(pill, &RMF_CAPA2); + if (capa == NULL) + return -EPROTO; + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + return -EPROTO; + } + } + + /* fill in stripe data for layout lock */ + lock = ldlm_handle2lock(lockh); + if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) { + LDLM_LOCK_PUT(lock); + return -ENOMEM; + } + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + if (lock != NULL) + LDLM_LOCK_PUT(lock); + + return rc; +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. 
*/ +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **reqp, u64 extra_lock_flags) +{ + static const ldlm_policy_data_t lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } + }; + static const ldlm_policy_data_t update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } + }; + static const ldlm_policy_data_t layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } + }; + static const ldlm_policy_data_t getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } + }; + ldlm_policy_data_t const *policy = &lookup_policy; + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request *req; + u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + int generation, resends = 0; + struct ldlm_reply *lockrep; + enum lvb_type lvb_type = LVB_T_NONE; + int rc; + + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it) { + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) + policy = &getxattr_policy; + } + + LASSERT(reqp == NULL); + + generation = obddev->u.cli.cl_import->imp_generation; +resend: + flags = saved_flags; + if (!it) { + /* The only way right now is FLOCK, in this case we hide flock + policy as lmm, but lmmsize is 0 */ + LASSERT(lmm && lmmsize == 0); + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + policy = (ldlm_policy_data_t *)lmm; + res_id.name[3] = LDLM_FLOCK; + req = NULL; + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize, + einfo->ei_cbdata); + policy = &update_policy; + einfo->ei_cbdata = NULL; + lmm = NULL; + } else if (it->it_op & IT_UNLINK) { + req = mdc_intent_unlink_pack(exp, it, op_data); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + return -EOPNOTSUPP; + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + return -EINVAL; + } + + if (IS_ERR(req)) + return PTR_ERR(req); + + if (req != NULL && it && it->it_op & IT_CREAT) + /* ask ptlrpc not to resend on EINPROGRESS since we have our own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = get_seconds() + resends; + } + + /* It is important to obtain rpc_lock first (if applicable), so that + * threads that are serialised with rpc_lock are not polluting our + * rpcs in flight counter. 
We do not do flock request limiting, though*/ + if (it) { + mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + return rc; + } + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + if (!it) { + /* For flock requests we immediately return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock + requests anyway. But in case of problem during comms with + Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we + can not rely on caller and this mainly for F_UNLCKs + (explicits or automatically generated by Kernel to clean + current FLocks upon exit) that can't be trashed */ + if ((rc == -EINTR) || (rc == -ETIMEDOUT)) + goto resend; + return rc; + } + + mdc_exit_request(&obddev->u.cli); + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + + if (rc < 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, + "%s: ldlm_cli_enqueue failed: rc = %d\n", + obddev->obd_name, rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + return rc; + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry the create infinitely when we get -EINPROGRESS from + * server. This is required by the new quota design. */ + if (it && it->it_op & IT_CREAT && + (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obddev->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == obddev->u.cli.cl_import->imp_generation) { + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + return -EIO; + } + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->d.lustre.it_lock_handle = 0; + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + } + + return rc; +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct mdt_body *mdt_body; + struct ldlm_lock *lock; + int rc; + + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing the + * intent, i.e. because it couldn't unpack the request. */ + LASSERT(it->d.lustre.it_status != 0); + return it->d.lustre.it_status; + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + return rc; + + mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(mdt_body != NULL); /* mdc_enqueue checked */ + + /* If we were revalidating a fid/name pair, mark the intent in + * case we fail and get called again from lookup */ + if (fid_is_sane(&op_data->op_fid2) && + it->it_create_mode & M_CHECK_STALE && + it->it_op != IT_GETATTR) { + + /* Also: did we find the same inode? 
*/ + /* sever can return one of two fids: + * op_fid2 - new allocated fid - if file is created. + * op_fid3 - existent fid - if file only open. + * op_fid3 is saved in lmv_intent_open */ + if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) && + (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) { + CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID + "\n", PFID(&op_data->op_fid2), + PFID(&op_data->op_fid2), PFID(&mdt_body->fid1)); + return -ESTALE; + } + } + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + return rc; + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + ptlrpc_request_addref(request); /* balanced in ll_create_node */ + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + ptlrpc_request_addref(request); /* balanced in ll_file_open */ + /* BUG 11546 - eviction in the middle of open rpc processing */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT)); + } + + /* If we already have a matching lock, then cancel the new + * one. We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget().) */ + lock = ldlm_handle2lock(lockh); + if (lock) { + ldlm_policy_data_t policy = lock->l_policy_data; + + LDLM_DEBUG(lock, "matching against this"); + + LASSERTF(fid_res_name_eq(&mdt_body->fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), PFID(&mdt_body->fid1)); + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, + &old_lock, 0)) { + ldlm_lock_decref_and_cancel(lockh, + it->d.lustre.it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->d.lustre.it_lock_handle = lockh->cookie; + } + } + CDEBUG(D_DENTRY, + "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", + op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op), + it->d.lustre.it_status, it->d.lustre.it_disposition, rc); + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode; + + if (it->d.lustre.it_lock_handle) { + lockh.cookie = it->d.lustre.it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. 
+ * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here.*/ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_PERM; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->d.lustre.it_lock_handle = lockh.cookie; + it->d.lustre.it_lock_mode = mode; + } else { + it->d.lustre.it_lock_handle = 0; + it->d.lustre.it_lock_mode = 0; + } + + return !!mode; +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. + * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what d.lustre.it_status refers to. + * + * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + }; + struct lustre_handle lockh; + int rc = 0; + + LASSERT(it); + + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#Lo\n", op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR))) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + it->d.lustre.it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + (from inode_revalidate) */ + if (rc || op_data->op_namelen != 0) + return rc; + } + + /* For case if upper layer did not alloc fid, do it now. 
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + } + rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh, lmm, lmmsize, NULL, + extra_lock_flags); + if (rc < 0) + return rc; + + *reqp = it->d.lustre.it_data; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + return rc; +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = ga->ga_einfo; + struct lookup_intent *it; + struct lustre_handle *lockh; + struct obd_device *obddev; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + + it = &minfo->mi_it; + lockh = &minfo->mi_lockh; + + obddev = class_exp2obd(exp); + + mdc_exit_request(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, + &flags, NULL, 0, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue_fini: %d\n", rc); + mdc_clear_replay_flag(req, rc); + goto out; + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + goto out; + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + +out: + OBD_FREE_PTR(einfo); + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id; + /*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed + * for statahead currently. Consider CMD in future, such two bits + * maybe managed by different MDS, should be adjusted then. 
*/ + ldlm_policy_data_t policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE } + }; + int rc = 0; + __u64 flags = LDLM_FL_HAS_INTENT; + + CDEBUG(D_DLMTRACE, + "name: %.*s in inode "DFID", intent: %s flags %#Lo\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + req = mdc_intent_getattr_pack(exp, it, op_data); + if (IS_ERR(req)) + return PTR_ERR(req); + + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + ptlrpc_req_finished(req); + return rc; + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL, + 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + mdc_exit_request(&obddev->u.cli); + ptlrpc_req_finished(req); + return rc; + } + + CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args)); + ga = ptlrpc_req_async_args(req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + ga->ga_einfo = einfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c new file mode 100644 index 000000000..5e9c6296c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c @@ -0,0 +1,483 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include + +#include "../include/obd_class.h" +#include "mdc_internal.h" +#include "../include/lustre_fid.h" + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, + struct mdc_rpc_lock *rpc_lock, + int level) +{ + int rc; + + request->rq_send_state = level; + + mdc_get_rpc_lock(rpc_lock, NULL); + rc = ptlrpc_queue_wait(request); + mdc_put_rpc_lock(rpc_lock, NULL); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) + rc = -EPROTO; + + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. 
*/ +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + ldlm_policy_data_t policy = {}; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + return 0; + + fid_build_reg_res_name(fid, &res_id); + res = ldlm_resource_get(exp->exp_obd->obd_namespace, + NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. */ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, + mode, 0, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return count; +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + struct mdc_rpc_lock *rpc_lock; + struct obd_device *obd = exp->exp_obd; + int count = 0, rc; + __u64 bits; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0) + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, + 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, + ea2len); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + rpc_lock = obd->u.cli.cl_rpc_lock; + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime "CFS_TIME_T + ", ctime "CFS_TIME_T"\n", + LTIME_S(op_data->op_attr.ia_mtime), + LTIME_S(op_data->op_attr.ia_ctime)); + mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len); + + ptlrpc_request_set_replen(req); + if (mod && (op_data->op_flags & MF_EPOCH_OPEN) && + req->rq_import->imp_replayable) { + LASSERT(*mod == NULL); + + *mod = obd_mod_alloc(); + if (*mod == NULL) { + DEBUG_REQ(D_ERROR, req, "Can't allocate md_open_data"); + } else { + req->rq_replay = 1; + req->rq_cb_data = *mod; + (*mod)->mod_open_req = req; + req->rq_commit_cb = mdc_commit_open; + (*mod)->mod_is_create = true; + /** + * Take an extra reference on \var mod, it protects \var + * mod from being freed on eviction (commit callback is + * called despite rq_replay flag). + * Will be put on mdc_done_writing(). + */ + obd_mod_get(*mod); + } + } + + rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL); + + /* Save the obtained info in the original RPC for the replay case. 
*/ + if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) { + struct mdt_ioepoch *epoch; + struct mdt_body *body; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(epoch != NULL); + LASSERT(body != NULL); + epoch->handle = body->handle; + epoch->ioepoch = body->ioepoch; + req->rq_replay_cb = mdc_replay_open; + /** bug 3633, open may be committed and estale answer is not error */ + } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) { + rc = 0; + } else if (rc == -ERESTARTSYS) { + rc = 0; + } + *request = req; + if (rc && req->rq_commit_cb) { + /* Put an extra reference on \var mod on error case. */ + if (mod != NULL && *mod != NULL) + obd_mod_put(*mod); + req->rq_commit_cb(req); + } + return rc; +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + LIST_HEAD(cancels); + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_RMT_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(req, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = get_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code. 
*/ + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == import->imp_generation) { + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + return -EIO; + } + } else if (rc == 0) { + struct mdt_body *body; + struct lustre_capa *capa; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + if (body->valid & OBD_MD_FLMDSCAPA) { + capa = req_capsule_server_get(&req->rq_pill, + &RMF_CAPA1); + if (capa == NULL) + rc = -EPROTO; + } + } + + *request = req; + return rc; +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + return rc; +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_link_pack(req, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; 
+ + return rc; +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c new file mode 100644 index 000000000..f8ef5fe5e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -0,0 +1,2731 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include +# include +# include +# include + +#include "../include/lustre_acl.h" +#include "../include/obd_class.h" +#include "../include/lustre_fid.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" +#include "../include/lustre_log.h" + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +struct mdc_renew_capa_args { + struct obd_capa *ra_oc; + renew_capa_cb_t ra_cb; +}; + +static int mdc_cleanup(struct obd_device *obd); + +static int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) +{ + struct lustre_capa *capa; + struct obd_capa *c; + + /* swabbed already in mdc_enqueue */ + capa = req_capsule_server_get(&req->rq_pill, field); + if (capa == NULL) + return -EPROTO; + + c = alloc_capa(CAPA_SITE_CLIENT); + if (IS_ERR(c)) { + CDEBUG(D_INFO, "alloc capa failed!\n"); + return PTR_ERR(c); + } else { + c->c_capa = *capa; + *oc = c; + return 0; + } +} + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* mdc_enter_request() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously inf light + * against an MDT. */ + rc = mdc_enter_request(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + mdc_exit_request(cli); + + return rc; +} + +/* Helper that implements most of mdc_getstatus and signal_completed_replay. */ +/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */ +static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid, + struct obd_capa **pc, int level, int msg_flags) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS, + LUSTRE_MDS_VERSION, MDS_GETSTATUS); + if (req == NULL) + return -ENOMEM; + + mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0); + lustre_msg_add_flags(req->rq_reqmsg, msg_flags); + req->rq_send_state = level; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc); + if (rc) + goto out; + } + + *rootfid = body->fid1; + CDEBUG(D_NET, + "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), + lustre_msg_get_last_committed(req->rq_repmsg)); +out: + ptlrpc_req_finished(req); + return rc; +} + +/* This should be mdc_get_info("rootfid") */ +static int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid, + struct obd_capa **pc) +{ + return send_getstatus(class_exp2cliimp(exp), rootfid, pc, + LUSTRE_IMP_FULL, 0); +} + +/* + * This function now is known to always saying that it will receive 4 buffers + * from server. Even for cases when acl_size and md_size is zero, RPC header + * will contain 4 fields and RPC itself will contain zero size fields. This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. 
This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + return rc; + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + CDEBUG(D_NET, "mode: %o\n", body->mode); + + if (body->eadatasize != 0) { + mdc_update_max_ea_from_body(exp, body); + + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + return -EPROTO; + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + return -EPROTO; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + return -EPROTO; + } + + return 0; +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + return 0; + } + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + if (op_data->op_valid & OBD_MD_FLRMTPERM) { + LASSERT(client_is_remote(exp)); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + } + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GETATTR_NAME); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, + op_data->op_suppgids[0], 0); + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const 
struct lu_fid *cfid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION, + MDS_IS_SUBDIR); + if (req == NULL) + return -ENOMEM; + + mdc_is_subdir_pack(req, pfid, cfid, 0); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc && rc != -EREMOTE) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_xattr_common(struct obd_export *exp, + const struct req_format *fmt, + const struct lu_fid *fid, + struct obd_capa *oc, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) { + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + } + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + LIST_HEAD(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = cfs_curproc_cap_pack(); + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = get_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + + mdc_pack_capa(req, &RMF_CAPA1, oc); + } else { + mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags); + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, 
u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, oc, MDS_REINT, valid, xattr_name, + input, input_size, output_size, flags, + suppgid, request); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, + fid, oc, MDS_GETXATTR, valid, xattr_name, + input, input_size, output_size, flags, + -1, request); +} + +#ifdef CONFIG_FS_POSIX_ACL +static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + + if (!body->aclsize) + return 0; + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize); + + if (!buf) + return -EPROTO; + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize); + if (acl == NULL) + return 0; + + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl: %d\n", rc); + return rc; + } + + rc = posix_acl_valid(acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + return rc; + } + + md->posix_acl = acl; + return 0; +} +#else +#define mdc_unpack_acl(req, md) 0 +#endif + +int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + int rc; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->valid & OBD_MD_FLEASIZE) { + int lmmsize; + struct lov_mds_md *lmm; + + if (!S_ISREG(md->body->mode)) { + CDEBUG(D_INFO, + "OBD_MD_FLEASIZE set, should be a regular file, but is not\n"); + rc = -EPROTO; + goto out; + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, + "OBD_MD_FLEASIZE set, but eadatasize 0\n"); + rc = -EPROTO; + goto out; + } + lmmsize = md->body->eadatasize; + lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize); + if (!lmm) { + rc = -EPROTO; + goto out; + } + + rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize); + if (rc < 0) + goto out; + + if (rc < sizeof(*md->lsm)) { + CDEBUG(D_INFO, + "lsm size too small: rc < sizeof (*md->lsm) (%d < %d)\n", + rc, (int)sizeof(*md->lsm)); + rc = -EPROTO; + goto out; + } + + } else if (md->body->valid & OBD_MD_FLDIREA) { + int lmvsize; + struct lov_mds_md *lmv; + + if (!S_ISDIR(md->body->mode)) { + CDEBUG(D_INFO, + "OBD_MD_FLDIREA set, should be a directory, but is not\n"); + rc = -EPROTO; + goto out; + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, + "OBD_MD_FLDIREA is set, but eadatasize 0\n"); + return -EPROTO; + } + if (md->body->valid & OBD_MD_MEA) { + lmvsize = md->body->eadatasize; + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmvsize); + if (!lmv) { + rc = -EPROTO; + goto out; + } + + rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, + lmvsize); + if (rc < 0) + goto out; + + if (rc < sizeof(*md->mea)) { + CDEBUG(D_INFO, + "size too small: rc < sizeof(*md->mea) (%d < %d)\n", + rc, (int)sizeof(*md->mea)); + rc = -EPROTO; + goto out; + } + } + } + rc = 0; + + if (md->body->valid & OBD_MD_FLRMTPERM) { + /* remote permission */ + LASSERT(client_is_remote(exp)); + 
md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (!md->remote_perm) { + rc = -EPROTO; + goto out; + } + } else if (md->body->valid & OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + if (md->body->aclsize) { + rc = mdc_unpack_acl(req, md); + if (rc) + goto out; +#ifdef CONFIG_FS_POSIX_ACL + } else { + md->posix_acl = NULL; +#endif + } + } + if (md->body->valid & OBD_MD_FLMDSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc); + if (rc) + goto out; + md->mds_capa = oc; + } + + if (md->body->valid & OBD_MD_FLOSSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc); + if (rc) + goto out; + md->oss_capa = oc; + } + +out: + if (rc) { + if (md->oss_capa) { + capa_put(md->oss_capa); + md->oss_capa = NULL; + } + if (md->mds_capa) { + capa_put(md->mds_capa); + md->mds_capa = NULL; + } +#ifdef CONFIG_FS_POSIX_ACL + posix_acl_release(md->posix_acl); +#endif + if (md->lsm) + obd_free_memmd(dt_exp, &md->lsm); + } + return rc; +} + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + return 0; +} + +/** + * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING + * RPC chains. + */ +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old; + struct mdt_body *body; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + och = mod->mod_och; + if (och != NULL) { + struct lustre_handle *file_fh; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_fh = &och->och_fh; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_fh->cookie, body->handle.cookie); + old = *file_fh; + *file_fh = body->handle; + } + close_req = mod->mod_close_req; + if (close_req != NULL) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (och != NULL) + LASSERT(!memcmp(&old, &epoch->handle, sizeof(old))); + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->handle = body->handle; + } +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. 
+ */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->d.lustre.it_data; + struct obd_import *imp = open_req->rq_import; + + if (!open_req->rq_replay) + return 0; + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. */ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "Can't allocate md_open_data"); + return 0; + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->fid1; + rec->cr_ioepoch = body->ioepoch; + rec->cr_old_handle.cookie = body->handle.cookie; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->fid1)) { + DEBUG_REQ(D_ERROR, open_req, + "Saving replay request with insane fid"); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + return 0; +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + LASSERT(mod->mod_open_req->rq_replay == 0); + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n"); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). 
+ **/ + if (mod == NULL) + return 0; + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + return 0; +} + +/* Prepares the request for the replay by the given reply */ +static void mdc_close_handle_reply(struct ptlrpc_request *req, + struct md_op_data *op_data, int rc) { + struct mdt_body *repbody; + struct mdt_ioepoch *epoch; + + if (req && rc == -EAGAIN) { + repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + + epoch->flags |= MF_SOM_AU; + if (repbody->valid & OBD_MD_FLGETATTRLOCK) + op_data->op_flags |= MF_GETATTR_LOCK; + } +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + int rc; + int saved_rc = 0; + + + req_fmt = &RQF_MDS_CLOSE; + if (op_data->op_bias & MDS_HSM_RELEASE) { + req_fmt = &RQF_MDS_RELEASE_CLOSE; + + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: "DFID" failed to allocate FID: %d\n", + obd->obd_name, PFID(&op_data->op_fid1), rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. 
b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, + "couldn't find open req; expecting close error\n"); + } + + mdc_close_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, + "type == PTL_RPC_MSG_ERR, err = %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + mdc_close_handle_reply(req, op_data, rc); + return rc < 0 ? rc : saved_rc; +} + +static int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_DONE_WRITING); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (mod != NULL) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED setattr %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr"); + /* We no longer want to preserve this setattr for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } + + mdc_close_pack(req, op_data); + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open or setattr were + * committed and server failed before close was sent. 
+ * Let's check if mod exists and return no error in that case + */ + if (mod) { + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + LASSERT(mod->mod_open_req != NULL); + mdc_free_open(mod); + + /* Since now, mod is accessed through setattr req only, + * thus DW req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + + mdc_close_handle_reply(req, op_data, rc); + ptlrpc_req_finished(req); + return rc; +} + + +static int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + wait_queue_head_t waitq; + int resends = 0; + struct l_wait_info lwi; + int rc; + + *request = NULL; + init_waitqueue_head(&waitq); + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK, + MDS_BULK_PORTAL); + if (desc == NULL) { + ptlrpc_request_free(req); + return -ENOMEM; + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < op_data->op_npages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + mdc_readdir_pack(req, op_data->op_offset, + PAGE_CACHE_SIZE * op_data->op_npages, + &op_data->op_fid1, op_data->op_capa1); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + return rc; + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("too many resend retries, returning error\n"); + return -EIO; + } + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), + NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + return rc; + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("Unexpected # bytes transferred: %d (%ld expected)\n", + req->rq_bulk->bd_nob_transferred, + PAGE_CACHE_SIZE * op_data->op_npages); + ptlrpc_req_finished(req); + return -EPROTO; + } + + *request = req; + return 0; +} + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + return -ENODEV; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) { + rc = -ENOMEM; + goto output; + } + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = 
ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + goto out; + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *osfs = *msfs; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + return -ENAMETOOLONG; + if (gf->gf_pathlen < 2) + return -EOVERFLOW; + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf); + OBD_ALLOC(key, keylen); + if (key == NULL) + return -ENOMEM; + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) { + rc = -EINVAL; + goto out; + } + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL); + if (rc != 0 && rc != -EREMOTE) + goto out; + + if (vallen <= sizeof(*gf)) { + rc = -EPROTO; + goto out; + } else if (vallen > sizeof(*gf) + gf->gf_pathlen) { + rc = -EOVERFLOW; + goto out; + } + + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n%s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) { + rc = -EPROTO; + goto out; + } + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +{ + __u32 *archive_mask; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_REGISTER); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + archive_mask = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_mask == NULL) { + rc = -EPROTO; + goto out; + } + + *archive_mask = archives; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, 
op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + goto out; + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) { + rc = -EPROTO; + goto out; + } + + *hca = *req_hca; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_UNREGISTER); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + goto out; + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) { + rc = -EPROTO; + goto out; + } + + *hus = *req_hus; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) { + rc = -EPROTO; + goto out; + } + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + int rc; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = 
ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) { + rc = -EPROTO; + goto out; + } + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) { + rc = -EPROTO; + goto out; + } + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) { + rc = -EPROTO; + goto out; + } + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)buf; + + LASSERT(len <= KUC_CHANGELOG_MSG_MAXSIZE); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_CHANGELOG; + lh->kuc_flags = flags; + lh->kuc_msgtype = CL_RECORD; + lh->kuc_msglen = len; + return lh; +} + +#define D_CHANGELOG 0 + +struct changelog_show { + __u64 cs_startrec; + __u32 cs_flags; + struct file *cs_fp; + char *cs_buf; + struct obd_device *cs_obd; +}; + +static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct changelog_show *cs = data; + struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr; + struct kuc_hdr *lh; + int len, rc; + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d: rc = %d\n", + cs->cs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, rc); + return rc; + } + + if (rec->cr.cr_index < cs->cs_startrec) { + /* Skip entries earlier than what we are interested in */ + CDEBUG(D_CHANGELOG, "rec=%llu start=%llu\n", + rec->cr.cr_index, cs->cs_startrec); + return 0; + } + + CDEBUG(D_CHANGELOG, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID + " %.*s\n", rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + + /* Set up the message */ + lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags); + memcpy(lh + 1, &rec->cr, len - sizeof(*lh)); + + rc = libcfs_kkuc_msg_put(cs->cs_fp, lh); + CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc); + + return rc; +} + +static int mdc_changelog_send_thread(void *csdata) +{ + struct changelog_show *cs = csdata; + struct llog_ctxt *ctxt = NULL; + struct llog_handle *llh = NULL; + struct kuc_hdr *kuch; + int rc; + + CDEBUG(D_CHANGELOG, "changelog to fp=%p start %llu\n", + cs->cs_fp, cs->cs_startrec); + + OBD_ALLOC(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE); + if (cs->cs_buf == NULL) { + rc = -ENOMEM; + goto out; + } + + /* Set up the remote catalog handle */ + ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt == NULL) { + rc = -ENOENT; + goto out; + } + rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = 
%d\n", + cs->cs_obd->obd_name, rc); + goto out; + } + rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL); + if (rc) { + CERROR("llog_init_handle failed %d\n", rc); + goto out; + } + + rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0); + + /* Send EOF no matter what our result */ + kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch), cs->cs_flags); + if (kuch) { + kuch->kuc_msgtype = CL_EOF; + libcfs_kkuc_msg_put(cs->cs_fp, kuch); + } + +out: + fput(cs->cs_fp); + if (llh) + llog_cat_close(NULL, llh); + if (ctxt) + llog_ctxt_put(ctxt); + if (cs->cs_buf) + OBD_FREE(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_changelog_send(struct obd_device *obd, + struct ioc_changelog *icc) +{ + struct changelog_show *cs; + int rc; + + /* Freed in mdc_changelog_send_thread */ + OBD_ALLOC_PTR(cs); + if (!cs) + return -ENOMEM; + + cs->cs_obd = obd; + cs->cs_startrec = icc->icc_recno; + /* matching fput in mdc_changelog_send_thread */ + cs->cs_fp = fget(icc->icc_id); + cs->cs_flags = icc->icc_flags; + + /* + * New thread because we should return to user app before + * writing into our pipe + */ + rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs, + "mdc_clg_send_thread")); + if (!IS_ERR_VALUE(rc)) { + CDEBUG(D_CHANGELOG, "start changelog thread\n"); + return 0; + } + + CERROR("Failed to start changelog thread: %d\n", rc); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION, + MDS_QUOTACHECK); + if (req == NULL) + return -ENOMEM; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_quota_poll_check(struct obd_export *exp, + struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + return rc; +} + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (req == NULL) + return -ENOMEM; + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg) { + oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqc) { + *oqctl = *oqc; + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + 
ptlrpc_req_finished(req); + + return rc; +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks hold by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_swap_layouts_pack(req, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CHANGELOG_SEND: + rc = mdc_ioc_changelog_send(obd, karg); + goto out; + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + struct changelog_setinfo cs = { + .cs_recno = icc->icc_recno, + .cs_id = icc->icc_id + }; + + rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, + NULL); + goto out; + } + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + goto out; + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. */ + if (rc == -EEXIST) + rc = 0; + goto out; + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + goto out; + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + goto out; + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + goto out; + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + goto out; + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + goto out; + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + goto out; + rc = 0; + goto out; + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + goto out; + case OBD_IOC_POLL_QUOTACHECK: + rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg); + goto out; + case OBD_IOC_PING_TARGET: + rc = ptlrpc_obd_ping(obd); + goto out; + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. 
+ */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) { + rc = -ENODEV; + goto out; + } + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min_t(size_t, data->ioc_plen2, + sizeof(struct obd_uuid)))) { + rc = -EFAULT; + goto out; + } + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc != 0) + goto out; + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(size_t, data->ioc_plen1, + sizeof(stat_buf)))) { + rc = -EFAULT; + goto out; + } + + rc = 0; + goto out; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) { + rc = -ENOMEM; + goto out; + } + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + goto out; + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) { + rc = -EFAULT; + goto out; + } + + rc = 0; + goto out; + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + goto out; + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + rc = -ENOTTY; + goto out; + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + int vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(__u32)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (ptlrpc_rep_need_swab(req)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + return rc; +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + int i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_zero(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport 
*/ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + __u32 archive = lk->lk_data; + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + rc = mdc_ioc_hsm_ct_register(imp, archive); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(int len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %d < %d\n", len, + (int) (sizeof(*lh) + sizeof(*hal))); + return -EPROTO; + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + return -EPROTO; + } + + CDEBUG(D_HSM, + "Received message mg=%x t=%d m=%d l=%d actions=%d on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + + return rc; +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. + * @param data archive id served by the copytool + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + __u32 archive = data; + int rc; + + CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n", + archive); + rc = mdc_ioc_hsm_ct_register(imp, archive); + + /* ignore error if the copytool is already registered */ + return ((rc != 0) && (rc != -EEXIST)) ? rc : 0; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. 
+ */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister, + (void *)imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + return -EINVAL; + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + return rc; + } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + return 0; + } + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + return 0; + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + return rc; + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(vallen, val); + return rc; + } + + CERROR("Unknown key %s\n", (char *)key); + return -EINVAL; +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + int mdsize, *max_easize; + + if (*vallen != sizeof(int)) + return -EINVAL; + mdsize = *(int *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + return 0; + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + int *default_easize; + + if (*vallen != sizeof(int)) + return -EINVAL; + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + return 0; + } else if (KEY_IS(KEY_MAX_COOKIESIZE)) { + int mdsize, *max_cookiesize; + + if (*vallen != sizeof(int)) + return -EINVAL; + mdsize = *(int *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_cookiesize) + exp->exp_obd->u.cli.cl_max_mds_cookiesize = mdsize; + max_cookiesize = val; + *max_cookiesize = exp->exp_obd->u.cli.cl_max_mds_cookiesize; + return 0; + } else if (KEY_IS(KEY_DEFAULT_COOKIESIZE)) { + int *default_cookiesize; + + if (*vallen != sizeof(int)) + return -EINVAL; + default_cookiesize = val; + *default_cookiesize = + exp->exp_obd->u.cli.cl_default_mds_cookiesize; + return 0; + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + return -EINVAL; + + *data = imp->imp_connect_data; + return 0; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = 1; + return 0; + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + return rc; +} + +static int mdc_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + 
if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, fid, oc, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { +#if 0 + /* XXX Pass event up to OBDs stack. used only for FLD now */ + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL); +#endif + break; + } + case IMP_EVENT_INACTIVE: { + struct client_obd *cli = &obd->u.cli; + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + if (cli->cl_seq != NULL) + seq_client_flush(cli->cl_seq); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + /* redo the kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + return rc; +} + +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct lu_client_seq *seq = cli->cl_seq; + + return seq_client_alloc_fid(NULL, seq, fid); +} + +static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_for_recovery(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + return 0; + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + return 0; + + return 1; +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +static struct ldlm_valblock_ops inode_lvbo = { + .lvbo_free = mdc_resource_inode_free, +}; + +static int mdc_llog_init(struct obd_device *obd) +{ + struct obd_llog_group *olg = &obd->obd_olg; + struct llog_ctxt *ctxt; + int rc; + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + return rc; + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + return 0; +} + +static void mdc_llog_finish(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); +} + +static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + struct client_obd *cli = &obd->u.cli; + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + OBD_ALLOC(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + if 
(!cli->cl_rpc_lock) + return -ENOMEM; + mdc_init_rpc_lock(cli->cl_rpc_lock); + + ptlrpcd_addref(); + + OBD_ALLOC(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); + if (!cli->cl_close_lock) { + rc = -ENOMEM; + goto err_rpc_lock; + } + mdc_init_rpc_lock(cli->cl_close_lock); + + rc = client_obd_setup(obd, cfg); + if (rc) + goto err_close_lock; + lprocfs_mdc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + + ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = mdc_llog_init(obd); + if (rc) { + mdc_cleanup(obd); + CERROR("failed to setup llogging subsystems\n"); + } + + return rc; + +err_close_lock: + OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); +err_rpc_lock: + OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + ptlrpcd_decref(); + return rc; +} + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA and cookie without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize, int def_cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + if (cli->cl_max_mds_cookiesize < cookiesize) + cli->cl_max_mds_cookiesize = cookiesize; + + if (cli->cl_default_mds_cookiesize < def_cookiesize) + cli->cl_default_mds_cookiesize = def_cookiesize; + + return 0; +} + +static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + /* Failsafe, ok if racy */ + if (obd->obd_type->typ_refcnt <= 1) + libcfs_kkuc_group_rem(0, KUC_GRP_HSM); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + + mdc_llog_finish(obd); + break; + } + return 0; +} + +static int mdc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); + + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + +static int mdc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct lprocfs_static_vars lvars = { NULL }; + int rc = 0; + + lprocfs_mdc_init_vars(&lvars); + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + return rc; +} + + +/* get remote permission for current user on fid */ +static int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + LASSERT(client_is_remote(exp)); + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + 
mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_interpret_renew_capa(const struct lu_env *env, + struct ptlrpc_request *req, void *args, + int status) +{ + struct mdc_renew_capa_args *ra = args; + struct mdt_body *body = NULL; + struct lustre_capa *capa; + + if (status) { + capa = ERR_PTR(status); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + capa = ERR_PTR(-EFAULT); + goto out; + } + + if ((body->valid & OBD_MD_FLOSSCAPA) == 0) { + capa = ERR_PTR(-ENOENT); + goto out; + } + + capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2); + if (!capa) { + capa = ERR_PTR(-EFAULT); + goto out; + } +out: + ra->ra_cb(ra->ra_oc, capa); + return 0; +} + +static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct ptlrpc_request *req; + struct mdc_renew_capa_args *ra; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR, + LUSTRE_MDS_VERSION, MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + /* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the + * capa to renew is oss capa. + */ + mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0); + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args)); + ra = ptlrpc_req_async_args(req); + ra->ra_oc = oc; + ra->ra_cb = cb; + req->rq_interpret_reply = mdc_interpret_renew_capa; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + return 0; +} + +static struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_process_config = mdc_process_config, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, + .o_quotacheck = mdc_quotacheck +}; + +static struct md_ops mdc_md_ops = { + .m_getstatus = mdc_getstatus, + .m_null_inode = mdc_null_inode, + .m_find_cbdata = mdc_find_cbdata, + .m_close = mdc_close, + .m_create = mdc_create, + .m_done_writing = mdc_done_writing, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_is_subdir = mdc_is_subdir, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_sync = mdc_sync, + .m_readpage = mdc_readpage, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + 
.m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_renew_capa = mdc_renew_capa, + .m_unpack_capa = mdc_unpack_capa, + .m_get_remote_perm = mdc_get_remote_perm, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock +}; + +static int __init mdc_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + + lprocfs_mdc_init_vars(&lvars); + + return class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars, + LUSTRE_MDC_NAME, NULL); +} + +static void /*__exit*/ mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/mgc/Makefile b/kernel/drivers/staging/lustre/lustre/mgc/Makefile new file mode 100644 index 000000000..cc6e9f51a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += mgc.o +mgc-y := mgc_request.o +mgc-$(CONFIG_PROC_FS) += lproc_mgc.o diff --git a/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c new file mode 100644 index 000000000..c4ea38e5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "mgc_internal.h" + +LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, import); +LPROC_SEQ_FOPS_RO_TYPE(mgc, state); + +LPROC_SEQ_FOPS_WR_ONLY(mgc, ping); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} +LPROC_SEQ_FOPS_RO(mgc_ir_state); + +static struct lprocfs_vars lprocfs_mgc_obd_vars[] = { + { "uuid", &mgc_uuid_fops, NULL, 0 }, + { "ping", &mgc_ping_fops, NULL, 0222 }, + { "connect_flags", &mgc_connect_flags_fops, NULL, 0 }, + { "mgs_server_uuid", &mgc_server_uuid_fops, NULL, 0 }, + { "mgs_conn_uuid", &mgc_conn_uuid_fops, NULL, 0 }, + { "import", &mgc_import_fops, NULL, 0 }, + { "state", &mgc_state_fops, NULL, 0 }, + { "ir_state", &mgc_ir_state_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(mgc, numrefs); +static struct lprocfs_vars lprocfs_mgc_module_vars[] = { + { "num_refs", &mgc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mgc_module_vars; + lvars->obd_vars = lprocfs_mgc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h new file mode 100644 index 000000000..a6f8b3ced --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_log.h" +#include "../include/lustre_export.h" + +#if defined (CONFIG_PROC_FS) +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars); +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); +#else +static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +static inline int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +static inline int cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_SPTLRPC; +} + +static inline int cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_RECOVER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c b/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c new file mode 100644 index 000000000..7947aec5c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -0,0 +1,1762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#include +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_log.h" +#include "../include/lustre_disk.h" + +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + int type) +{ + __u64 resname = 0; + + if (len > sizeof(resname)) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch (type) { + case CONFIG_T_CONFIG: + case CONFIG_T_SPTLRPC: + resname = 0; + break; + case CONFIG_T_RECOVER: + case CONFIG_T_PARAMS: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +static int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" + * there is an exception: llog "params" */ + name_end = strrchr(logname, '-'); + if (!name_end) + len = strlen(logname); + else + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} + +/********************** config llog list **********************/ +static LIST_HEAD(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + return 0; +} + +/* Drop a reference to a config log. When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + if (cld->cld_recover) + config_log_put(cld->cld_recover); + if (cld->cld_sptlrpc) + config_log_put(cld->cld_sptlrpc); + if (cld->cld_params) + config_log_put(cld->cld_params); + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + void *instance; + + LASSERT(logname != NULL); + + instance = cfg ? 
cfg->cfg_instance : NULL; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if instance equals */ + if (instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + break; + } + } + if (found) { + atomic_inc(&found->cld_refcount); + LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0); + } + spin_unlock(&config_list_lock); + return found; +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + int type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, + cfg ? cfg->cfg_instance : NULL); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + return ERR_PTR(-ENOMEM); + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_start(logname); + cld->cld_cfg.cfg_obdname = obd->obd_name; + } + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (rc) { + config_log_put(cld); + return ERR_PTR(rc); + } + + if (cld_is_sptlrpc(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing sptlrpc log: %d\n", rc); + } + + return cld; +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* we have to use different llog for clients and mdts for cmd + * where only clients are notified if one of cmd server restarts */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strcpy(logname, fsname); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == NULL); + lcfg.cfg_instance = sb; + strcat(logname, "-mdtir"); + } else { + LASSERT(lcfg.cfg_instance != NULL); + strcat(logname, "-cliir"); + } + + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data *config_params_log_add(struct obd_device *obd, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + lcfg.cfg_instance = sb; + + cld = do_config_log_add(obd, PARAMS_FILENAME, CONFIG_T_PARAMS, + &lcfg, sb); + + return cld; +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
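/*
 * Editor's note: illustrative sketch, not part of the patch. It models the
 * config-log registry used above (config_log_get/put/find) in plain
 * userspace C: entries live on one list, carry a reference count, and are
 * unlinked and freed only when the last reference is dropped while the
 * list lock is held, so a lookup can never return an entry whose count has
 * already reached zero. All "demo_" names are invented for the example.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct demo_cld {
	struct demo_cld *next;
	int refcount;				/* stands in for cld_refcount */
	char logname[64];			/* stands in for cld_logname */
};

static struct demo_cld *demo_list;		/* stands in for config_llog_list */
static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* take a reference, as config_log_get() does */
static void demo_log_get(struct demo_cld *cld)
{
	pthread_mutex_lock(&demo_lock);
	cld->refcount++;
	pthread_mutex_unlock(&demo_lock);
}

/* drop a reference; free the entry once nobody references it any more */
static void demo_log_put(struct demo_cld *cld)
{
	struct demo_cld **pp;

	pthread_mutex_lock(&demo_lock);
	if (--cld->refcount > 0) {
		pthread_mutex_unlock(&demo_lock);
		return;
	}
	/* unlink while still holding the list lock */
	for (pp = &demo_list; *pp; pp = &(*pp)->next) {
		if (*pp == cld) {
			*pp = cld->next;
			break;
		}
	}
	pthread_mutex_unlock(&demo_lock);
	free(cld);
}

/* look an entry up by name; on success it is returned with an extra reference */
static struct demo_cld *demo_log_find(const char *logname)
{
	struct demo_cld *cld;

	pthread_mutex_lock(&demo_lock);
	for (cld = demo_list; cld; cld = cld->next) {
		if (strcmp(cld->logname, logname) == 0) {
			cld->refcount++;
			break;
		}
	}
	pthread_mutex_unlock(&demo_lock);
	return cld;
}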
+ */ +static int config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + struct config_llog_data *sptlrpc_cld; + struct config_llog_data *params_cld; + char seclogname[32]; + char *ptr; + int rc; + + CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + return -EINVAL; + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + sptlrpc_cld = config_log_find(seclogname, NULL); + if (sptlrpc_cld == NULL) { + sptlrpc_cld = do_config_log_add(obd, seclogname, + CONFIG_T_SPTLRPC, NULL, NULL); + if (IS_ERR(sptlrpc_cld)) { + CERROR("can't create sptlrpc log: %s\n", seclogname); + rc = PTR_ERR(sptlrpc_cld); + goto out_err; + } + } + params_cld = config_params_log_add(obd, cfg, sb); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + goto out_err1; + } + + cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + CERROR("can't create log: %s\n", logname); + rc = PTR_ERR(cld); + goto out_err2; + } + + cld->cld_sptlrpc = sptlrpc_cld; + cld->cld_params = params_cld; + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) { + struct config_llog_data *recover_cld; + *strrchr(seclogname, '-') = 0; + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + goto out_err3; + } + cld->cld_recover = recover_cld; + } + + return 0; + +out_err3: + config_log_put(cld); + +out_err2: + config_log_put(params_cld); + +out_err1: + config_log_put(sptlrpc_cld); + +out_err: + return rc; +} + +DEFINE_MUTEX(llog_process_lock); + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + int rc = 0; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + return -ENOENT; + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. 
+ */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + return rc; + } + + cld->cld_stopping = 1; + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + + if (cld_recover) { + mutex_lock(&cld_recover->cld_lock); + cld_recover->cld_stopping = 1; + mutex_unlock(&cld_recover->cld_lock); + config_log_put(cld_recover); + } + + spin_lock(&config_list_lock); + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + spin_unlock(&config_list_lock); + + if (cld_sptlrpc) + config_log_put(cld_sptlrpc); + + if (cld_params) { + mutex_lock(&cld_params->cld_lock); + cld_params->cld_stopping = 1; + mutex_unlock(&cld_params->cld_lock); + config_log_put(cld_params); + } + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + return rc; +} + +#if defined (CONFIG_PROC_FS) +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED"); + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +#endif + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Do not run mgc_process_log on a disconnected export or an + export which is being disconnected. Take the client + semaphore to make the check non-racy. */ + down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); +} + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. 
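/*
 * Editor's note: illustrative sketch, not part of the patch. The two macros
 * defined just below give a 5 s base delay plus up to 0x1ff centiseconds
 * (about 5.1 s) of random jitter before requeueing, so clients do not all
 * hammer the MGS at the same moment. The same arithmetic rendered in
 * userspace C, in milliseconds, with invented "demo_" names:
 */
#include <stdlib.h>

#define DEMO_TIMEOUT_MIN_SECONDS	5
#define DEMO_TIMEOUT_RAND_CENTISEC	0x1ff	/* ~500 centiseconds */

/* returns the requeue delay in milliseconds: 5000 ms .. ~10110 ms */
static unsigned int demo_requeue_delay_ms(void)
{
	unsigned int jitter_csec = (unsigned int)rand() & DEMO_TIMEOUT_RAND_CENTISEC;

	return DEMO_TIMEOUT_MIN_SECONDS * 1000 + jitter_csec * 10;
}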
+ */ +#define MGC_TIMEOUT_MIN_SECONDS 5 +#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + +static int mgc_requeue_thread(void *data) +{ + bool first = true; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (1) { + struct l_wait_info lwi; + struct config_llog_data *cld, *cld_prev; + int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC; + int stopped = !!(rq_state & RQ_STOP); + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + caused the lock revocation to finish its setup, plus some + random so everyone doesn't try to reconnect at once. */ + to = MGC_TIMEOUT_MIN_SECONDS * HZ; + to += rand * HZ / 100; /* rand is centi-seconds */ + lwi = LWI_TIMEOUT(to, NULL, NULL); + l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP), + &lwi); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock) + continue; + + spin_unlock(&config_list_lock); + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Whether we enqueued again or not in mgc_process_log, + * we're done with the ref from the old enqueue */ + if (cld_prev) + config_log_put(cld_prev); + cld_prev = cld; + + cld->cld_lostlock = 0; + if (likely(!stopped)) + do_requeue(cld); + + spin_lock(&config_list_lock); + } + spin_unlock(&config_list_lock); + if (cld_prev) + config_log_put(cld_prev); + + /* break after scanning the list so that we can drop + * refcount to losing lock clds */ + if (unlikely(stopped)) { + spin_lock(&config_list_lock); + break; + } + + /* Wait a bit to see if anyone else needs a requeue */ + lwi = (struct l_wait_info) { 0 }; + l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP), + &lwi); + spin_lock(&config_list_lock); + } + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + return 0; +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping || cld->cld_lostlock) { + mutex_unlock(&cld->cld_lock); + return; + } + /* this refcount will be released in mgc_requeue_thread. 
*/ + config_log_get(cld); + cld->cld_lostlock = 1; + mutex_unlock(&cld->cld_lock); + + /* Hold lock for rq_state */ + spin_lock(&config_list_lock); + if (rq_state & RQ_STOP) { + spin_unlock(&config_list_lock); + cld->cld_lostlock = 0; + config_log_put(cld); + } else { + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + } +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + return rc; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + return 0; +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + return 0; +} + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + int temp; + + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + return rc; +} + +static int mgc_cleanup(struct obd_device *obd) +{ + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (obd->obd_type->typ_refcnt <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars; + int rc; + + ptlrpcd_addref(); + + rc = client_obd_setup(obd, lcfg); + if (rc) + goto err_decref; + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + goto err_cleanup; + } + + lprocfs_mgc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL, + "ll_cfg_requeue")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: Cannot start requeue thread (%d),no more log updates!\n", + obd->obd_name, rc); + goto err_cleanup; + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + return rc; + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + return rc; +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... 
*/ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (!lock->l_conn_export || + !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + return rc; +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_SEND_PARAM_LIMIT 10 + +/* Send parameter to MGS*/ +static int mgc_set_mgs_param(struct obd_export *exp, + struct mgs_send_param *msp) +{ + struct ptlrpc_request *req; + struct mgs_send_param *req_msp, *rep_msp; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, + MGS_SET_INFO); + if (!req) + return -ENOMEM; + + req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + if (!req_msp) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + + memcpy(req_msp, msp, sizeof(*req_msp)); + ptlrpc_request_set_replen(req); + + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + memcpy(msp, rep_msp, sizeof(*rep_msp)); + } + + ptlrpc_req_finished(req); + + return rc; +} + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + 
struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + return rc; +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. -jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + return -ENOMEM; + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + return rc; +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + return -EINVAL; + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect if we previously died */ + if ((imp->imp_state != LUSTRE_IMP_FULL && + imp->imp_state != LUSTRE_IMP_NEW) || value > 1) + ptlrpc_reconnect_import(imp); + return 0; + } + if (KEY_IS(KEY_SET_INFO)) { + struct mgs_send_param *msp; + + msp = (struct mgs_send_param *)val; + rc = mgc_set_mgs_param(exp, msp); + return rc; + } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. + * + * if flavor has been set previously, check the asking flavor + * must match the existing one. 
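/*
 * Editor's note: illustrative sketch, not part of the patch. It captures the
 * KEY_MGSSEC rule spelled out in the comment above: an empty request means
 * "keep the current flavor, or 'null' if none was ever set", the first
 * explicit request is remembered, and any later request must match it or
 * fail with -EPERM. Names and the string-based flavor store are invented
 * stand-ins for the sptlrpc flavor structures.
 */
#include <string.h>
#include <errno.h>

static char demo_mgc_flavor[20];	/* empty string == not set yet */

static int demo_set_mgs_flavor(const char *requested)
{
	if (requested == NULL || requested[0] == '\0')
		requested = demo_mgc_flavor[0] ? demo_mgc_flavor : "null";

	if (demo_mgc_flavor[0] == '\0') {
		/* first caller wins and records the flavor */
		strncpy(demo_mgc_flavor, requested, sizeof(demo_mgc_flavor) - 1);
		return 0;
	}

	/* later callers must ask for the same flavor */
	return strcmp(demo_mgc_flavor, requested) == 0 ? 0 : -EPERM;
}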
+ */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + return 0; + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + return rc; + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + return rc; + } + + return rc; +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *unused) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + return 0; +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + + LASSERT(cfg->cfg_instance != NULL); + LASSERT(cfg->cfg_sb == cfg->cfg_instance); + + OBD_ALLOC(inst, PAGE_CACHE_SIZE); + if (inst == NULL) + return -ENOMEM; + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance); + if (pos >= PAGE_CACHE_SIZE) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + return -E2BIG; + } + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + PAGE_CACHE_SIZE); + if (rc) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + return -EINVAL; + } + pos = strlen(inst); + } + + ++pos; + buf = inst + pos; + bufsz = PAGE_CACHE_SIZE - pos; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost; + struct obd_device *obd; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = -EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = 
(typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_CACHE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += sprintf(obdname + pos, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + /* client does not connect to the OST yet */ + up_read(&obd->u.cli.cl_sem); + rc = 0; + continue; + } + + /* TODO: iterate all nids to find one */ + /* find uuid by nid */ + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[0], + (struct obd_uuid *)uuid); + up_read(&obd->u.cli.cl_sem); + if (rc < 0) { + CERROR("mgc: cannot find uuid by nid %s\n", + libcfs_nid2str(entry->u.nids[0])); + break; + } + + CDEBUG(D_INFO, "Find uuid %s by nid %s\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + rc = -ENOMEM; + lcfg = lustre_cfg_new(LCFG_PARAM, &bufs); + if (lcfg == NULL) { + CERROR("mgc: cannot allocate memory\n"); + break; + } + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + OBD_FREE(inst, PAGE_CACHE_SIZE); + return rc; +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery logs. 
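/*
 * Editor's note: simplified sketch, not part of the patch. It shows the
 * record-walking pattern mgc_apply_recover_logs() above relies on: each
 * nidtbl record carries its own length and a strictly increasing version,
 * and parsing stops at the first malformed or out-of-order record. The
 * struct layout and field names here are invented stand-ins for
 * struct mgs_nidtbl_entry.
 */
#include <stdint.h>
#include <stddef.h>

struct demo_nidtbl_entry {
	uint64_t version;	/* must be > previous and <= max_version */
	uint32_t length;	/* total bytes of this record, incl. nids */
	uint32_t nid_count;	/* at least one nid must follow */
	/* nid_count * sizeof(uint64_t) bytes of nid data follow */
};

static int demo_apply_records(const void *data, size_t datalen,
			      uint64_t max_version)
{
	uint64_t prev_version = 0;
	size_t off = 0;

	while (datalen - off >= sizeof(struct demo_nidtbl_entry)) {
		const struct demo_nidtbl_entry *e =
			(const void *)((const char *)data + off);
		size_t room = datalen - off;
		size_t need;

		/* sanity checks mirror the kernel loop: counts, sizes, ordering */
		if (e->nid_count == 0 ||
		    e->nid_count > (room - sizeof(*e)) / sizeof(uint64_t))
			return -1;
		need = sizeof(*e) + (size_t)e->nid_count * sizeof(uint64_t);
		if (e->length < need || e->length > room)
			return -1;
		if (e->version > max_version || e->version <= prev_version)
			return -1;
		prev_version = e->version;

		/* ... here the real code builds an "<obd>.import=connection=<nid>"
		 * parameter and hands it to the config machinery ... */

		off += e->length;
	}
	return 0;
}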
+ */ +static int mgc_process_recover_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct ptlrpc_bulk_desc *desc; + struct page **pages; + int nrpages; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. + */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0) /* the first time */ + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC(pages, sizeof(*pages) * nrpages); + if (pages == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_IOFS); + if (pages[i] == NULL) { + rc = -ENOMEM; + goto out; + } + } + +again: + LASSERT(cld_is_recover(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + goto out; + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) { + rc = -E2BIG; + goto out; + } + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_CACHE_SHIFT; + body->mcb_units = nrpages; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK, + MGS_BULK_PORTAL); + if (desc == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < nrpages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (res->mcr_size < res->mcr_offset) { + rc = -EINVAL; + goto out; + } + + /* always update the index even though it might have errors with + * handling the recover logs */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) { + rc = ealen; + goto out; + } + + if (ealen > nrpages << PAGE_CACHE_SHIFT) { + rc = -EINVAL; + goto out; + } + + if (ealen == 0) { /* no logs transferred */ + if (!eof) + rc = -EINVAL; + goto out; + } + + mne_swab = !!ptlrpc_rep_need_swab(req); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* This import flag means the server did an extra swab of IR MNE + * records (fixed in LU-1252), reverse it here if needed. 
LU-1644 */ + if (unlikely(req->rq_import->imp_need_mne_swab)) + mne_swab = !mne_swab; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + void *ptr; + + ptr = kmap(pages[i]); + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr, + min_t(int, ealen, PAGE_CACHE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("Process recover log %s error %d\n", + cld->cld_logname, rc2); + break; + } + + ealen -= PAGE_CACHE_SIZE; + } + +out: + if (req) + ptlrpc_req_finished(req); + + if (rc == 0 && !eof) + goto again; + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE(pages, sizeof(*pages) * nrpages); + } + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + bool sptlrpc_started = false; + struct lu_env *env; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + /* + * local copy of sptlrpc log is controlled elsewhere, don't try to + * read it up here. + */ + if (cld_is_sptlrpc(cld) && local_only) + return 0; + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + return -ENOMEM; + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + goto out_free; + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + if (local_only) /* no local log at client side */ { + rc = -EIO; + goto out_pop; + } + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_update_begin(cld->cld_logname); + sptlrpc_started = true; + } + + /* logname and instance info should be the same, so use our + * copy of the instance for the update. The cfg_last_idx will + * be updated here. */ + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + +out_pop: + __llog_ctxt_put(env, ctxt); + + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel. + * the logname must be -sptlrpc + */ + if (sptlrpc_started) { + LASSERT(cld_is_sptlrpc(cld)); + sptlrpc_conf_log_update_end(cld->cld_logname); + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + } + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Copy the log locally before parsing it if appropriate (non-MGS server) + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + + LASSERT(cld); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) 
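/*
 * Editor's note: illustrative sketch, not part of the patch. It mirrors the
 * control flow of mgc_process_recover_log() above: ask the server for
 * records starting at last_idx + 1, always remember the offset it returns,
 * and repeat until the returned offset catches up with the log size (eof).
 * demo_fetch_t is an invented stand-in for the MGS_CONFIG_READ RPC.
 */
#include <stdint.h>

struct demo_read_res {
	uint64_t offset;	/* last record offset the server returned */
	uint64_t size;		/* current end of the recovery log */
};

typedef int (*demo_fetch_t)(uint64_t start, struct demo_read_res *res);

static int demo_pull_recover_log(demo_fetch_t fetch, uint64_t *last_idx)
{
	struct demo_read_res res;
	int rc;

	do {
		rc = fetch(*last_idx + 1, &res);
		if (rc)
			return rc;
		if (res.offset > res.size)	/* malformed reply */
			return -1;
		/* always advance the index, even if applying records failed */
		*last_idx = res.offset;
	} while (res.offset != res.size);	/* not eof yet: ask again */

	return 0;
}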
*/ + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + return 0; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, + LCK_CR, &flags, NULL, NULL, NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(rc == 0); + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + cld->cld_lostlock = 1; + /* Get extra reference, it will be put in requeue thread */ + config_log_get(cld); + } + + + if (cld_is_recover(cld)) { + rc = 0; /* this is not a fatal error for recover log */ + if (rcl == 0) + rc = mgc_process_recover_log(mgc, cld); + } else { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc); + + mutex_unlock(&cld->cld_lock); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) + ldlm_lock_decref(&lockh, LCK_CR); + + return rc; +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). + */ +static int mgc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + + switch (lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) { + rc = -EINVAL; + goto out; + } + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + rc = config_log_add(obd, logname, cfg, sb); + if (rc) + break; + cld = config_log_find(logname, cfg); + if (cld == NULL) { + rc = -ENOENT; + break; + } + + /* COMPAT_146 */ + /* FIXME only set this for old logs! 
Right now this forces + us to always skip the "inside markers" check */ + cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir = cld->cld_recover; + cld->cld_recover = NULL; + config_log_put(cir); + } + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, + "There is no params config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR( + "%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + config_log_put(cld); + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + + } + } +out: + return rc; +} + +struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + /* .o_enqueue = mgc_enqueue, */ + /* .o_iocontrol = mgc_iocontrol, */ + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, NULL, + LUSTRE_MGC_NAME, NULL); +} + +static void /*__exit*/ mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/Makefile b/kernel/drivers/staging/lustre/lustre/obdclass/Makefile new file mode 100644 index 000000000..e89468179 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_LUSTRE_FS) += obdclass.o + +obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ + llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ + genops.o uuid.o lprocfs_status.o \ + lustre_handles.o lustre_peer.o \ + statfs_pack.o obdo.o obd_config.o obd_mount.o \ + lu_object.o dt_object.o capa.o cl_object.o \ + cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o + +obdclass-$(CONFIG_PROC_FS) += lprocfs_counters.o diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/acl.c b/kernel/drivers/staging/lustre/lustre/obdclass/acl.c new file mode 100644 index 000000000..9a69f6b35 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/acl.c @@ -0,0 +1,548 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include "../include/lu_object.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_eacl.h" +#include "../include/obd_support.h" + +#ifdef CONFIG_FS_POSIX_ACL + +#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION + +enum { + ES_UNK = 0, /* unknown stat */ + ES_UNC = 1, /* ACL entry is not changed */ + ES_MOD = 2, /* ACL entry is modified */ + ES_ADD = 3, /* ACL entry is added */ + ES_DEL = 4 /* ACL entry is deleted */ +}; + +static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); + d->e_stat = le32_to_cpu(s->e_stat); +} + +static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); + d->e_stat = cpu_to_le32(s->e_stat); +} + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} + + +/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */ +static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header, + int old_count, int new_count) +{ + int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr); + int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr); + posix_acl_xattr_header *new; + + if (unlikely(old_count <= new_count)) + return old_size; + + OBD_ALLOC(new, new_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, new_size); + OBD_FREE(*header, old_size); + *header = new; + return new_size; +} + +/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. 
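/*
 * Editor's note: illustrative sketch, not part of the patch. The
 * CFS_ACL_XATTR_SIZE/COUNT macros used throughout this file presumably
 * follow the usual posix_acl_xattr layout: a fixed header followed by an
 * array of fixed-size entries. The helpers below spell out that size
 * arithmetic with invented struct names; the real header and entry layouts
 * differ.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_acl_header { uint32_t version; };
struct demo_acl_entry  { uint16_t tag; uint16_t perm; uint32_t id; };

/* bytes needed to hold "count" entries plus the header */
static size_t demo_acl_size(size_t count)
{
	return sizeof(struct demo_acl_header) +
	       count * sizeof(struct demo_acl_entry);
}

/* number of entries that fit in a buffer of "size" bytes */
static size_t demo_acl_count(size_t size)
{
	return (size - sizeof(struct demo_acl_header)) /
	       sizeof(struct demo_acl_entry);
}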
*/ +static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header, + int old_count) +{ + int ext_count = le32_to_cpu((*header)->a_count); + int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr); + ext_acl_xattr_header *new; + + if (unlikely(old_count <= ext_count)) + return 0; + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, ext_size); + OBD_FREE(*header, old_size); + *header = new; + return 0; +} + +/* + * Generate new extended ACL based on the posix ACL. + */ +ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size) +{ + int count, i, esize; + ext_acl_xattr_header *new; + + if (unlikely(size < 0)) + return ERR_PTR(-EINVAL); + else if (!size) + count = 0; + else + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr); + OBD_ALLOC(new, esize); + if (unlikely(new == NULL)) + return ERR_PTR(-ENOMEM); + + new->a_count = cpu_to_le32(count); + for (i = 0; i < count; i++) { + new->a_entries[i].e_tag = header->a_entries[i].e_tag; + new->a_entries[i].e_perm = header->a_entries[i].e_perm; + new->a_entries[i].e_id = header->a_entries[i].e_id; + new->a_entries[i].e_stat = cpu_to_le32(ES_UNK); + } + + return new; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext); + +/* + * Filter out the "nobody" entries in the posix ACL. + */ +int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size, + posix_acl_xattr_header **out) +{ + int count, i, j, rc = 0; + __u32 id; + posix_acl_xattr_header *new; + + if (!size) + return 0; + if (size < sizeof(*new)) + return -EINVAL; + + OBD_ALLOC(new, size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + for (i = 0, j = 0; i < count; i++) { + id = le32_to_cpu(header->a_entries[i].e_id); + switch (le16_to_cpu(header->a_entries[i].e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto _out; + } + + memcpy(&new->a_entries[j++], &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_USER: + if (id != NOBODY_UID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_GROUP: + if (id != NOBODY_GID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + default: + rc = -EIO; + goto _out; + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, count, j); + if (rc >= 0) { + size = rc; + *out = new; + rc = 0; + } + +_out: + if (rc) { + OBD_FREE(new, size); + size = rc; + } + return size; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_filter); + +/* + * Release the posix ACL space. + */ +void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size) +{ + OBD_FREE(header, size); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_free); + +/* + * Release the extended ACL space. 
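/*
 * Editor's note: simplified sketch, not part of the patch. It restates the
 * filtering rule implemented by lustre_posix_acl_xattr_filter() above: base
 * entries (USER_OBJ, GROUP_OBJ, MASK, OTHER) are kept and must carry the
 * undefined id, while ACL_USER/ACL_GROUP entries naming the "nobody"
 * uid/gid are dropped. The tag values, nobody ids, and types below are
 * invented stand-ins.
 */
#include <stdint.h>

#define DEMO_ACL_USER_OBJ	0x01
#define DEMO_ACL_USER		0x02
#define DEMO_ACL_GROUP_OBJ	0x04
#define DEMO_ACL_GROUP		0x08
#define DEMO_ACL_MASK		0x10
#define DEMO_ACL_OTHER		0x20
#define DEMO_UNDEFINED_ID	((uint32_t)-1)
#define DEMO_NOBODY_UID		99u
#define DEMO_NOBODY_GID		99u

struct demo_acl_ent { uint16_t tag; uint16_t perm; uint32_t id; };

/* copies wanted entries from src to dst (same capacity); returns the new
 * entry count, or -1 on a malformed base entry or unknown tag */
static int demo_filter_nobody(const struct demo_acl_ent *src, int count,
			      struct demo_acl_ent *dst)
{
	int i, j = 0;

	for (i = 0; i < count; i++) {
		switch (src[i].tag) {
		case DEMO_ACL_USER_OBJ:
		case DEMO_ACL_GROUP_OBJ:
		case DEMO_ACL_MASK:
		case DEMO_ACL_OTHER:
			if (src[i].id != DEMO_UNDEFINED_ID)
				return -1;
			dst[j++] = src[i];
			break;
		case DEMO_ACL_USER:
			if (src[i].id != DEMO_NOBODY_UID)
				dst[j++] = src[i];
			break;
		case DEMO_ACL_GROUP:
			if (src[i].id != DEMO_NOBODY_GID)
				dst[j++] = src[i];
			break;
		default:
			return -1;
		}
	}
	return j;
}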
+ */ +void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header) +{ + OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \ + ext_acl_xattr)); +} +EXPORT_SYMBOL(lustre_ext_acl_xattr_free); + +static ext_acl_xattr_entry * +lustre_ext_acl_xattr_search(ext_acl_xattr_header *header, + posix_acl_xattr_entry *entry, int *pos) +{ + int once, start, end, i, j, count = le32_to_cpu(header->a_count); + + once = 0; + start = *pos; + end = count; + +again: + for (i = start; i < end; i++) { + if (header->a_entries[i].e_tag == entry->e_tag && + header->a_entries[i].e_id == entry->e_id) { + j = i; + if (++i >= count) + i = 0; + *pos = i; + return &header->a_entries[j]; + } + } + + if (!once) { + once = 1; + start = 0; + end = *pos; + goto again; + } + + return NULL; +} + +/* + * Merge the posix ACL and the extended ACL into new posix ACL. + */ +int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out) +{ + int posix_count, posix_size, i, j; + int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0; + posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID}; + posix_acl_xattr_header *new; + ext_acl_xattr_entry *ee, ae; + + lustre_posix_acl_cpu_to_le(&pe, &pe); + ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos); + if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) { + /* there are only base ACL entries at most. */ + posix_count = 3; + posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + for (i = 0, j = 0; i < ext_count; i++) { + lustre_ext_acl_le_to_cpu(&ae, + &ext_header->a_entries[i]); + switch (ae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (ae.e_id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto _out; + } + + if (ae.e_stat != ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + break; + case ACL_MASK: + case ACL_USER: + case ACL_GROUP: + if (ae.e_stat == ES_DEL) + break; + default: + rc = -EIO; + goto _out; + } + } + } else { + /* maybe there are valid ACL_USER or ACL_GROUP entries in the + * original server-side ACL, they are regarded as ES_UNC stat.*/ + int ori_posix_count; + + if (unlikely(size < 0)) + return -EINVAL; + else if (!size) + ori_posix_count = 0; + else + ori_posix_count = + CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + posix_count = ori_posix_count + ext_count; + posix_size = + CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + /* 1. process the unchanged ACL entries + * in the original server-side ACL. */ + pos = 0; + for (i = 0, j = 0; i < ori_posix_count; i++) { + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee == NULL) + memcpy(&new->a_entries[j++], + &posix_header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + } + + /* 2. process the non-deleted entries + * from client-side extended ACL. 
*/ + for (i = 0; i < ext_count; i++) { + if (le16_to_cpu(ext_header->a_entries[i].e_stat) != + ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j); + if (rc >= 0) { + posix_size = rc; + *out = new; + rc = 0; + } + +_out: + if (rc) { + OBD_FREE(new, posix_size); + posix_size = rc; + } + return posix_size; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2posix); + +/* + * Merge the posix ACL and the extended ACL into new extended ACL. + */ +ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header) +{ + int ori_ext_count, posix_count, ext_count, ext_size; + int i, j, pos = 0, rc = 0; + posix_acl_xattr_entry pae; + ext_acl_xattr_header *new; + ext_acl_xattr_entry *ee, eae; + + if (unlikely(size < 0)) + return ERR_PTR(-EINVAL); + else if (!size) + posix_count = 0; + else + posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + ori_ext_count = le32_to_cpu(ext_header->a_count); + ext_count = posix_count + ori_ext_count; + ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return ERR_PTR(-ENOMEM); + + for (i = 0, j = 0; i < posix_count; i++) { + lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]); + switch (pae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (pae.e_id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto out; + } + case ACL_USER: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_UID) + break; + + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + case ACL_GROUP: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_GID) + break; + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + default: + rc = -EIO; + goto out; + } + } + + /* process deleted entries. */ + for (i = 0; i < ori_ext_count; i++) { + lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]); + if (eae.e_stat == ES_UNK) { + /* ignore "nobody" entry. 
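+ * Everything else still marked ES_UNK was not matched by any posix
+ * ACL entry above and is therefore emitted with ES_DEL status.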
*/ + if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) || + (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID)) + continue; + + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j].e_id = ext_header->a_entries[i].e_id; + new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL); + } + } + + new->a_count = cpu_to_le32(j); + /* free unused space. */ + rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count); + +out: + if (rc) { + OBD_FREE(new, ext_size); + new = ERR_PTR(rc); + } + return new; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2ext); + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/capa.c b/kernel/drivers/staging/lustre/lustre/obdclass/capa.c new file mode 100644 index 000000000..d206b1046 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/capa.c @@ -0,0 +1,421 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/capa.c + * + * Lustre Capability Hash Management + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include +#include + +#include "../include/obd_class.h" +#include "../include/lustre_debug.h" +#include "../include/lustre/lustre_idl.h" + +#include +#include "../include/lustre_capa.h" + +#define NR_CAPAHASH 32 +#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */ + +struct kmem_cache *capa_cachep = NULL; + +/* lock for capa hash/capa_list/fo_capa_keys */ +DEFINE_SPINLOCK(capa_lock); + +struct list_head capa_list[CAPA_SITE_MAX]; + +static struct capa_hmac_alg capa_hmac_algs[] = { + DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20), +}; +/* capa count */ +int capa_count[CAPA_SITE_MAX] = { 0, }; + +EXPORT_SYMBOL(capa_cachep); +EXPORT_SYMBOL(capa_list); +EXPORT_SYMBOL(capa_lock); +EXPORT_SYMBOL(capa_count); + +static inline +unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize; +} + +struct hlist_head *init_capa_hash(void) +{ + struct hlist_head *hash; + int nr_hash, i; + + OBD_ALLOC(hash, PAGE_CACHE_SIZE); + if (!hash) + return NULL; + + nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head); + LASSERT(nr_hash > NR_CAPAHASH); + + for (i = 0; i < NR_CAPAHASH; i++) + INIT_HLIST_HEAD(hash + i); + return hash; +} +EXPORT_SYMBOL(init_capa_hash); + +static inline int capa_on_server(struct obd_capa *ocapa) +{ + return ocapa->c_site == CAPA_SITE_SERVER; +} + +static inline void capa_delete(struct obd_capa *ocapa) +{ + LASSERT(capa_on_server(ocapa)); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); + capa_count[ocapa->c_site]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +void cleanup_capa_hash(struct hlist_head *hash) +{ + int i; + struct hlist_node *next; + struct obd_capa *oc; + + spin_lock(&capa_lock); + for (i = 0; i < NR_CAPAHASH; i++) { + hlist_for_each_entry_safe(oc, next, hash + i, + u.tgt.c_hash) + capa_delete(oc); + } + spin_unlock(&capa_lock); + + OBD_FREE(hash, PAGE_CACHE_SIZE); +} +EXPORT_SYMBOL(cleanup_capa_hash); + +static inline int capa_hashfn(struct lu_fid *fid) +{ + return (fid_oid(fid) ^ fid_ver(fid)) * + (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH; +} + +/* capa renewal time check is earlier than that on client, which is to prevent + * client renew right after obtaining it. */ +static inline int capa_is_to_expire(struct obd_capa *oc) +{ + return time_before(cfs_time_sub(oc->c_expiry, + cfs_time_seconds(oc->c_capa.lc_timeout)*2/3), + cfs_time_current()); +} + +static struct obd_capa *find_capa(struct lustre_capa *capa, + struct hlist_head *head, int alive) +{ + struct obd_capa *ocapa; + int len = alive ? 
offsetof(struct lustre_capa, lc_keyid):sizeof(*capa); + + hlist_for_each_entry(ocapa, head, u.tgt.c_hash) { + if (memcmp(&ocapa->c_capa, capa, len)) + continue; + /* don't return one that will expire soon in this case */ + if (alive && capa_is_to_expire(ocapa)) + continue; + + LASSERT(capa_on_server(ocapa)); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found"); + return ocapa; + } + + return NULL; +} + +#define LRU_CAPA_DELETE_COUNT 12 +static inline void capa_delete_lru(struct list_head *head) +{ + struct obd_capa *ocapa; + struct list_head *node = head->next; + int count = 0; + + /* free LRU_CAPA_DELETE_COUNT unused capa from head */ + while (count++ < LRU_CAPA_DELETE_COUNT) { + ocapa = list_entry(node, struct obd_capa, c_list); + node = node->next; + if (atomic_read(&ocapa->c_refc)) + continue; + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru"); + capa_delete(ocapa); + } +} + +/* add or update */ +struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) +{ + struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid); + struct obd_capa *ocapa, *old = NULL; + struct list_head *list = &capa_list[CAPA_SITE_SERVER]; + + ocapa = alloc_capa(CAPA_SITE_SERVER); + if (IS_ERR(ocapa)) + return NULL; + + spin_lock(&capa_lock); + old = find_capa(capa, head, 0); + if (!old) { + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + hlist_add_head(&ocapa->u.tgt.c_hash, head); + list_add_tail(&ocapa->c_list, list); + capa_get(ocapa); + capa_count[CAPA_SITE_SERVER]++; + if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) + capa_delete_lru(list); + spin_unlock(&capa_lock); + return ocapa; + } + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; +} +EXPORT_SYMBOL(capa_add); + +struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, + int alive) +{ + struct obd_capa *ocapa; + + spin_lock(&capa_lock); + ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive); + if (ocapa) { + list_move_tail(&ocapa->c_list, + &capa_list[CAPA_SITE_SERVER]); + capa_get(ocapa); + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(capa_lookup); + +static inline int ll_crypto_hmac(struct crypto_hash *tfm, + u8 *key, unsigned int *keylen, + struct scatterlist *sg, + unsigned int size, u8 *result) +{ + struct hash_desc desc; + int rv; + desc.tfm = tfm; + desc.flags = 0; + rv = crypto_hash_setkey(desc.tfm, key, *keylen); + if (rv) { + CERROR("failed to hash setkey: %d\n", rv); + return rv; + } + return crypto_hash_digest(&desc, sg, size, result); +} + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) +{ + struct crypto_hash *tfm; + struct capa_hmac_alg *alg; + int keylen; + struct scatterlist sl; + + if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) { + CERROR("unknown capability hmac algorithm!\n"); + return -EFAULT; + } + + alg = &capa_hmac_algs[capa_alg(capa)]; + + tfm = crypto_alloc_hash(alg->ha_name, 0, 0); + if (IS_ERR(tfm)) { + CERROR("crypto_alloc_tfm failed, check whether your kernel has crypto support!\n"); + return PTR_ERR(tfm); + } + keylen = alg->ha_keylen; + + sg_init_table(&sl, 1); + sg_set_page(&sl, virt_to_page(capa), + offsetof(struct lustre_capa, lc_hmac), + (unsigned long)(capa) % PAGE_CACHE_SIZE); + + ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac); + crypto_free_hash(tfm); + + return 0; +} +EXPORT_SYMBOL(capa_hmac); + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct crypto_blkcipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int 
rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = crypto_alloc_blkcipher(alg, 0, 0); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + return PTR_ERR(tfm); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + rc = -EINVAL; + goto out; + } + + rc = crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + goto out; + } + + sg_init_table(&sd, 1); + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_init_table(&ss, 1); + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + goto out; + } + +out: + crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_encrypt_id); + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct crypto_blkcipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = crypto_alloc_blkcipher(alg, 0, 0); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + return PTR_ERR(tfm); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + rc = -EINVAL; + goto out; + } + + rc = crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + goto out; + } + + sg_init_table(&sd, 1); + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_init_table(&ss, 1); + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + goto out; + } + +out: + crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_decrypt_id); + +void capa_cpy(void *capa, struct obd_capa *ocapa) +{ + spin_lock(&ocapa->c_lock); + *(struct lustre_capa *)capa = ocapa->c_capa; + spin_unlock(&ocapa->c_lock); +} +EXPORT_SYMBOL(capa_cpy); + +void _debug_capa(struct lustre_capa *c, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " capability@%p fid " DFID " opc %#llx uid %llu gid %llu flags %u alg %d keyid %u timeout %u expiry %u\n", + c, PFID(capa_fid(c)), capa_opc(c), + capa_uid(c), capa_gid(c), capa_flags(c), + capa_alg(c), capa_keyid(c), capa_timeout(c), + capa_expiry(c)); + va_end(args); +} +EXPORT_SYMBOL(_debug_capa); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h new file mode 100644 index 000000000..7eb0ad7b3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Possible levels of the nesting. Currently this is 2: there are "top" + * entities (files, extent locks), and "sub" entities (stripes and stripe + * locks). This is used only for debugging counters right now. + */ +enum clt_nesting_level { + CNL_TOP, + CNL_SUB, + CNL_NR +}; + +/** + * Counters used to check correctness of cl_lock interface usage. + */ +struct cl_thread_counters { + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int ctc_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref ctc_locks_locked; + /** Number of outstanding holds on locks. */ + int ctc_nr_held; + /** Number of outstanding uses on locks. */ + int ctc_nr_used; + /** Number of held extent locks. */ + int ctc_nr_locks_acquired; +}; + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * Counters for every level of lock nesting. + */ + struct cl_thread_counters clt_counters[CNL_NR]; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. + */ + struct cl_io *clt_current_io; + /** + * Used for submitting a sync io. + */ + struct cl_sync_io clt_anchor; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t clt_next_index; + pgoff_t clt_fn_index; /* first non-overlapped index */ +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c new file mode 100644 index 000000000..3141b6043 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -0,0 +1,1669 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include +#include "../include/cl_object.h" +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. 
+ */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + + /* sanity check for layout change */ + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + break; + case CIT_FAULT: + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + default: + LBUG(); + } +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + return result; +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
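+ * Records the [pos, pos + count) byte range in io->u.ci_rw and then
+ * delegates to cl_io_init() on io->ci_obj.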
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + return cl_io_init(env, io, iot, io->ci_obj); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + int ret; + + ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)); + if (ret) + return ret; + if (d0->cld_end < d1->cld_start) + return -1; + if (d0->cld_start > d0->cld_end) + return 1; + return 0; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + /* hidden treasure: bubble sort for now. */ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, + &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); +} + +/** + * Check whether \a queue contains locks matching \a need. 
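+ * Only lock descriptors are compared; no locks are taken or referenced.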
+ * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + return +1; + } + return 0; +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_queue_merge(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_cmp(&scan->cill_descr, need)) + continue; + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + return +1; + } + return 0; + +} + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_merge(&set->cls_todo, need) || + cl_lockset_match(set, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + return result; +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + else + break; + } + } + return result; +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. 
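+ * On failure any locks already acquired are released via cl_io_unlock();
+ * on success the io moves to CIS_LOCKED.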
+ */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + return result; +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired); +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + return result; +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. 
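+ * cio_advance() is called bottom-to-top on every layer that implements it.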
*/ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + return result; +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + return result; +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + return result; +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result = 1; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + if (!cl_io_is_append(io)) { + const struct cl_io_rw_common *crw = &(io->u.ci_rw); + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = crw->crw_pos < end && + start < crw->crw_pos + crw->crw_count; + } + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. 
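+ * Every layer's cio_read_page() method is invoked and the resulting queue
+ * is submitted with cl_io_submit_rw(); pages left unsent are disowned
+ * before returning.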
+ * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + return result; +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. + * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. + */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + return result; +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. 
+ * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + return result; +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(anchor, +1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor, timeout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page transmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. 
+ */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + if (result == 0) + result = io->ci_result; + return result < 0 ? result : 0; +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_get(page); +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == current); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. 
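+ * Pages are removed from the list and released; cl_page_disown0() is used
+ * because some of them may already be in CPS_FREEING state.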
+ */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + pgoff_t index = 0; + int result; + + LINVRNT(plist->pl_owner == current); + + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(index <= page->cp_index); + index = page->cp_index; + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + return result; +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == current); + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + return result; +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + cl_page_list_add(&queue->c2_qin, page); +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. 
+ */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + cl_2queue_init(queue); + cl_2queue_add(queue, page); +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + while (io->ci_parent != NULL) + io = io->ci_parent; + return io; +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + &req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof(req->crq_o[0])); + } + OBD_FREE_PTR(req); +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + return result; +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. + */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + /* + * for the lack of list_for_each_entry_reverse_safe()... + */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + req->crq_type = crt; + INIT_LIST_HEAD(&req->crq_pages); + INIT_LIST_HEAD(&req->crq_layers); + + OBD_ALLOC(req->crq_o, nr_objects * sizeof(req->crq_o[0])); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + return req; +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. 
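+ * The top-level page is linked onto cl_req::crq_pages and its top-level
+ * object is recorded (and referenced) in the request's crq_o[] array.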
+ */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + page = cl_page_top(page); + + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n", + req, req->crq_type, req->crq_nrpages); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + lu_object_ref_add_at(&obj->co_lu, &rqo->ro_obj_ref, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. + */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + page = cl_page_top(page); + + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, u64 flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + + /* Take any page to use as a model. */ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +# include + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nrpages); + atomic_set(&anchor->csi_barrier, nrpages > 0); + anchor->csi_sync_rc = 0; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. 
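+ *
+ * A minimal usage sketch (the variable names are illustrative, not taken
+ * from an actual caller):
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, nr_pages);
+ *     ... submit the pages; every completion eventually calls
+ *         cl_sync_io_note(&anchor, ioret) ...
+ *     rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);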
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("SYNC IO failed with error: %d, try to cancel %d remaining pages\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + (void)cl_io_cancel(env, io, queue); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + + POISON(anchor, 0x5a, sizeof(*anchor)); + return rc; +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + wake_up_all(&anchor->csi_waitq); + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); + } +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c new file mode 100644 index 000000000..b081167f9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c @@ -0,0 +1,2239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
+ *
+ * Author: Nikita Danilov
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include <linux/list.h>
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+        {
+                .ckd_cache = &cl_lock_kmem,
+                .ckd_name = "cl_lock_kmem",
+                .ckd_size = sizeof (struct cl_lock)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that is maintained at all times. Caller either has a
+ * reference to \a lock, or somehow assures that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+                                     const struct cl_lock *lock)
+{
+        return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
+                atomic_read(&lock->cll_ref) >= lock->cll_holds &&
+                lock->cll_holds >= lock->cll_users &&
+                lock->cll_holds >= 0 &&
+                lock->cll_users >= 0 &&
+                lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant, checking that caller has a reference on a lock.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+                             const struct cl_lock *lock)
+{
+        int result;
+
+        result = atomic_read(&lock->cll_ref) > 0 &&
+                cl_lock_invariant_trusted(env, lock);
+        if (!result && env != NULL)
+                CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+        return result;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+        return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
+}
+
+/**
+ * Returns a set of counters for this lock, depending on a lock nesting.
+ */ +static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env, + const struct cl_lock *lock) +{ + struct cl_thread_info *info; + enum clt_nesting_level nesting; + + info = cl_env_info(env); + nesting = cl_lock_nesting(lock); + LASSERT(nesting < ARRAY_SIZE(info->clt_counters)); + return &info->clt_counters[nesting]; +} + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)(%p/%d/%d) at %s():%d\n", + prefix, lock, atomic_read(&lock->cll_ref), + lock->cll_guarder, lock->cll_depth, + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags, + env, h->coh_nesting, cl_lock_nr_mutexed(env), + func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __func__, __LINE__) + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; + lock_map_acquire(&lock->dep_map); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || + need == CLM_PHANTOM || need == CLM_GROUP); + LINVRNT(has == CLM_READ || has == CLM_WRITE || + has == CLM_PHANTOM || has == CLM_GROUP); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + CLASSERT(CLM_WRITE < CLM_GROUP); + + if (has != CLM_GROUP) + return need <= has; + else + return need == has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. 
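+ *
+ * For example (hypothetical values, mirroring the checks in the function
+ * body): a lock covering [0, 99] in CLM_WRITE mode matches a need for
+ * [10, 20] in CLM_READ mode, whereas a CLM_GROUP lock only matches a need
+ * of the same mode and the same gid.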
+ */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode) && + (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. + */ +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + cl_object_same(has->cld_obj, need->cld_obj) && + cl_lock_ext_match(has, need); +} +EXPORT_SYMBOL(cl_lock_descr_match); + +static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + + LINVRNT(!cl_lock_is_mutexed(lock)); + + cl_lock_trace(D_DLMTRACE, env, "free lock", lock); + might_sleep(); + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + CS_LOCK_DEC(obj, total); + CS_LOCKSTATE_DEC(obj, lock->cll_state); + lu_object_ref_del_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock", lock); + cl_object_put(env, obj); + lu_ref_fini(&lock->cll_reference); + lu_ref_fini(&lock->cll_holders); + mutex_destroy(&lock->cll_guard); + OBD_SLAB_FREE_PTR(lock, cl_lock_kmem); +} + +/** + * Releases a reference on a lock. + * + * When last reference is released, lock is returned to the cache, unless it + * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed + * immediately. + * + * \see cl_object_put(), cl_page_put() + */ +void cl_lock_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj; + + LINVRNT(cl_lock_invariant(env, lock)); + obj = lock->cll_descr.cld_obj; + LINVRNT(obj != NULL); + + CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + + if (atomic_dec_and_test(&lock->cll_ref)) { + if (lock->cll_state == CLS_FREEING) { + LASSERT(list_empty(&lock->cll_linkage)); + cl_lock_free(env, lock); + } + CS_LOCK_DEC(obj, busy); + } +} +EXPORT_SYMBOL(cl_lock_put); + +/** + * Acquires an additional reference to a lock. + * + * This can be called only by caller already possessing a reference to \a + * lock. + * + * \see cl_object_get(), cl_page_get() + */ +void cl_lock_get(struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(NULL, lock)); + CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + atomic_inc(&lock->cll_ref); +} +EXPORT_SYMBOL(cl_lock_get); + +/** + * Acquires a reference to a lock. + * + * This is much like cl_lock_get(), except that this function can be used to + * acquire initial reference to the cached lock. Caller has to deal with all + * possible races. Use with care! + * + * \see cl_page_get_trust() + */ +void cl_lock_get_trust(struct cl_lock *lock) +{ + CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + if (atomic_inc_return(&lock->cll_ref) == 1) + CS_LOCK_INC(lock->cll_descr.cld_obj, busy); +} +EXPORT_SYMBOL(cl_lock_get_trust); + +/** + * Helper function destroying the lock that wasn't completely initialized. + * + * Other threads can acquire references to the top-lock through its + * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately. 
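+ *
+ * Instead, the lock is cancelled and deleted under its own mutex and the
+ * caller's reference is dropped, so the final cl_lock_put() (possibly from
+ * another thread) performs the actual freeing.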
+ */ +static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); +} + +static struct cl_lock *cl_lock_alloc(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *descr) +{ + struct cl_lock *lock; + struct lu_object_header *head; + + OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, GFP_NOFS); + if (lock != NULL) { + atomic_set(&lock->cll_ref, 1); + lock->cll_descr = *descr; + lock->cll_state = CLS_NEW; + cl_object_get(obj); + lu_object_ref_add_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock", + lock); + INIT_LIST_HEAD(&lock->cll_layers); + INIT_LIST_HEAD(&lock->cll_linkage); + INIT_LIST_HEAD(&lock->cll_inclosure); + lu_ref_init(&lock->cll_reference); + lu_ref_init(&lock->cll_holders); + mutex_init(&lock->cll_guard); + lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class); + init_waitqueue_head(&lock->cll_wq); + head = obj->co_lu.lo_header; + CS_LOCKSTATE_INC(obj, CLS_NEW); + CS_LOCK_INC(obj, total); + CS_LOCK_INC(obj, create); + cl_lock_lockdep_init(lock); + list_for_each_entry(obj, &head->loh_layers, + co_lu.lo_linkage) { + int err; + + err = obj->co_ops->coo_lock_init(env, obj, lock, io); + if (err != 0) { + cl_lock_finish(env, lock); + lock = ERR_PTR(err); + break; + } + } + } else + lock = ERR_PTR(-ENOMEM); + return lock; +} + +/** + * Transfer the lock into INTRANSIT state and return the original state. + * + * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED + * \post state: CLS_INTRANSIT + * \see CLS_INTRANSIT + */ +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock) +{ + enum cl_lock_state state = lock->cll_state; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(state != CLS_INTRANSIT); + LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED, + "Malformed lock state %d.\n", state); + + cl_lock_state_set(env, lock, CLS_INTRANSIT); + lock->cll_intransit_owner = current; + cl_lock_hold_add(env, lock, "intransit", current); + return state; +} +EXPORT_SYMBOL(cl_lock_intransit); + +/** + * Exit the intransit state and restore the lock state to the original state + */ +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(state != CLS_INTRANSIT); + LASSERT(lock->cll_intransit_owner == current); + + lock->cll_intransit_owner = NULL; + cl_lock_state_set(env, lock, state); + cl_lock_unhold(env, lock, "intransit", current); +} +EXPORT_SYMBOL(cl_lock_extransit); + +/** + * Checking whether the lock is intransit state + */ +int cl_lock_is_intransit(struct cl_lock *lock) +{ + LASSERT(cl_lock_is_mutexed(lock)); + return lock->cll_state == CLS_INTRANSIT && + lock->cll_intransit_owner != current; +} +EXPORT_SYMBOL(cl_lock_is_intransit); +/** + * Returns true iff lock is "suitable" for given io. E.g., locks acquired by + * truncate and O_APPEND cannot be reused for read/non-append-write, as they + * cover multiple stripes and can trigger cascading timeouts. 
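+ *
+ * The decision is delegated to the layers: each slice may veto reuse via
+ * its cl_lock_operations::clo_fits_into() method, and the lock fits only
+ * if no layer refuses.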
+ */ +static int cl_lock_fits_into(const struct lu_env *env, + const struct cl_lock *lock, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_fits_into != NULL && + !slice->cls_ops->clo_fits_into(env, slice, need, io)) + return 0; + } + return 1; +} + +static struct cl_lock *cl_lock_lookup(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_lock *lock; + struct cl_object_header *head; + + head = cl_object_header(obj); + assert_spin_locked(&head->coh_lock_guard); + CS_LOCK_INC(obj, lookup); + list_for_each_entry(lock, &head->coh_locks, cll_linkage) { + int matched; + + matched = cl_lock_ext_match(&lock->cll_descr, need) && + lock->cll_state < CLS_FREEING && + lock->cll_error == 0 && + !(lock->cll_flags & CLF_CANCELLED) && + cl_lock_fits_into(env, lock, need, io); + CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n", + PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need), + matched); + if (matched) { + cl_lock_get_trust(lock); + CS_LOCK_INC(obj, hit); + return lock; + } + } + return NULL; +} + +/** + * Returns a lock matching description \a need. + * + * This is the main entry point into the cl_lock caching interface. First, a + * cache (implemented as a per-object linked list) is consulted. If lock is + * found there, it is returned immediately. Otherwise new lock is allocated + * and returned. In any case, additional reference to lock is acquired. + * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + cl_lock_get_trust(lock); + list_add_tail(&lock->cll_linkage, + &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + CS_LOCK_INC(obj, busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + return lock; +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + do { + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + if (lock == NULL) + return NULL; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_INTRANSIT) + /* Don't care return value. 
*/ + cl_lock_state_wait(env, lock); + if (lock->cll_state == CLS_FREEING) { + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + } while (lock == NULL); + + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock, 1); + if (lock->cll_state == CLS_HELD) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, 0); + cl_lock_put(env, lock); + } else { + cl_unuse_try(env, lock); + cl_lock_unhold(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + return slice; + } + return NULL; +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + counters = cl_lock_counters(env, lock); + lock->cll_depth++; + counters->ctc_nr_locks_locked++; + lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock); + cl_lock_trace(D_TRACE, env, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. + * + * \post cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_put() + */ +void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_guarder == current) { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_depth > 0); + } else { + struct cl_object_header *hdr; + struct cl_thread_info *info; + int i; + + LINVRNT(lock->cll_guarder != current); + hdr = cl_object_header(lock->cll_descr.cld_obj); + /* + * Check that mutices are taken in the bottom-to-top order. + */ + info = cl_env_info(env); + for (i = 0; i < hdr->coh_nesting; ++i) + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting); + lock->cll_guarder = current; + LINVRNT(lock->cll_depth == 0); + } + cl_lock_mutex_tail(env, lock); +} +EXPORT_SYMBOL(cl_lock_mutex_get); + +/** + * Try-locks cl_lock object. + * + * \retval 0 \a lock was successfully locked + * + * \retval -EBUSY \a lock cannot be locked right now + * + * \post ergo(result == 0, cl_lock_is_mutexed(lock)) + * + * \see cl_lock_mutex_get() + */ +int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + + result = 0; + if (lock->cll_guarder == current) { + LINVRNT(lock->cll_depth > 0); + cl_lock_mutex_tail(env, lock); + } else if (mutex_trylock(&lock->cll_guard)) { + LINVRNT(lock->cll_depth == 0); + lock->cll_guarder = current; + cl_lock_mutex_tail(env, lock); + } else + result = -EBUSY; + return result; +} +EXPORT_SYMBOL(cl_lock_mutex_try); + +/** + {* Unlocks cl_lock object. 
+ * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == current); + LINVRNT(lock->cll_depth > 0); + + counters = cl_lock_counters(env, lock); + LINVRNT(counters->ctc_nr_locks_locked > 0); + + cl_lock_trace(D_TRACE, env, "put mutex", lock); + lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock); + counters->ctc_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == current; +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + struct cl_thread_info *info; + int i; + int locked; + + /* + * NOTE: if summation across all nesting levels (currently 2) proves + * too expensive, a summary counter can be added to + * struct cl_thread_info. + */ + info = cl_env_info(env); + for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + locked += info->clt_counters[i].ctc_nr_locks_locked; + return locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_state < CLS_FREEING) { + bool in_cache; + + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + in_cache = !list_empty(&lock->cll_linkage); + if (in_cache) + list_del_init(&lock->cll_linkage); + spin_unlock(&head->coh_lock_guard); + + if (in_cache) /* coh_locks cache holds a refcount. */ + cl_lock_put(env, lock); + + /* + * From now on, no new references to this lock can be acquired + * by cl_lock_lookup(). + */ + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_delete != NULL) + slice->cls_ops->clo_delete(env, slice); + } + /* + * From now on, no new references to this lock can be acquired + * by layer-specific means (like a pointer from struct + * ldlm_lock in osc, or a pointer from top-lock to sub-lock in + * lov). + * + * Lock will be finally freed in cl_lock_put() when last of + * existing references goes away. + */ + } +} + +/** + * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a + * top-lock (nesting == 0) accounts for this modification in the per-thread + * debugging counters. Sub-lock holds can be released by a thread different + * from one that acquired it. 
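+ *
+ * Note the counter hierarchy maintained by cl_lock_invariant_trusted():
+ * cll_ref >= cll_holds >= cll_users >= 0, i.e. every user implies a hold
+ * and every hold implies a reference.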
+ */ +static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_holds += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_held += delta; + LASSERT(counters->ctc_nr_held >= 0); + } +} + +/** + * Mod(ifie)s cl_lock::cll_users counter for a given lock. See + * cl_lock_hold_mod() for the explanation of the debugging code. + */ +static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_users += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_used += delta; + LASSERT(counters->ctc_nr_used >= 0); + } +} + +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock); + lu_ref_del(&lock->cll_holders, scope, source); + cl_lock_hold_mod(env, lock, -1); + if (lock->cll_holds == 0) { + CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock); + if (lock->cll_descr.cld_mode == CLM_PHANTOM || + lock->cll_descr.cld_mode == CLM_GROUP || + lock->cll_state != CLS_CACHED) + /* + * If lock is still phantom or grouplock when user is + * done with it---destroy the lock. + */ + lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED; + if (lock->cll_flags & CLF_CANCELPEND) { + lock->cll_flags &= ~CLF_CANCELPEND; + cl_lock_cancel0(env, lock); + } + if (lock->cll_flags & CLF_DOOMED) { + /* no longer doomed: it's dead... Jim. */ + lock->cll_flags &= ~CLF_DOOMED; + cl_lock_delete0(env, lock); + } + } +} +EXPORT_SYMBOL(cl_lock_hold_release); + +/** + * Waits until lock state is changed. + * + * This function is called with cl_lock mutex locked, atomically releases + * mutex and goes to sleep, waiting for a lock state change (signaled by + * cl_lock_signal()), and re-acquires the mutex before return. + * + * This function is used to wait until lock state machine makes some progress + * and to emulate synchronous operations on top of asynchronous lock + * interface. + * + * \retval -EINTR wait was interrupted + * + * \retval 0 wait wasn't interrupted + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_signal() + */ +int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock) +{ + wait_queue_t waiter; + sigset_t blocked; + int result; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_depth == 1); + LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */ + + cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock); + result = lock->cll_error; + if (result == 0) { + /* To avoid being interrupted by the 'non-fatal' signals + * (SIGCHLD, for instance), we'd block them temporarily. 
+ * LU-305 */ + blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(&lock->cll_wq, &waiter); + set_current_state(TASK_INTERRUPTIBLE); + cl_lock_mutex_put(env, lock); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + + /* Returning ERESTARTSYS instead of EINTR so syscalls + * can be restarted if signals are pending here */ + result = -ERESTARTSYS; + if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) { + schedule(); + if (!cfs_signal_pending()) + result = 0; + } + + cl_lock_mutex_get(env, lock); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lock->cll_wq, &waiter); + + /* Restore old blocked signals */ + cfs_restore_sigs(blocked); + } + return result; +} +EXPORT_SYMBOL(cl_lock_state_wait); + +static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) + if (slice->cls_ops->clo_state != NULL) + slice->cls_ops->clo_state(env, slice, state); + wake_up_all(&lock->cll_wq); +} + +/** + * Notifies waiters that lock state changed. + * + * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all + * layers about state change by calling cl_lock_operations::clo_state() + * top-to-bottom. + */ +void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock); + cl_lock_state_signal(env, lock, lock->cll_state); +} +EXPORT_SYMBOL(cl_lock_signal); + +/** + * Changes lock state. + * + * This function is invoked to notify layers that lock state changed, possible + * as a result of an asynchronous event such as call-back reception. + * + * \post lock->cll_state == state + * + * \see cl_lock_operations::clo_state() + */ +void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + LASSERT(lock->cll_state <= state || + (lock->cll_state == CLS_CACHED && + (state == CLS_HELD || /* lock found in cache */ + state == CLS_NEW || /* sub-lock canceled */ + state == CLS_INTRANSIT)) || + /* lock is in transit state */ + lock->cll_state == CLS_INTRANSIT); + + if (lock->cll_state != state) { + CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state); + CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state); + + cl_lock_state_signal(env, lock, state); + lock->cll_state = state; + } +} +EXPORT_SYMBOL(cl_lock_state_set); + +static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + + result = -ENOSYS; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_unuse != NULL) { + result = slice->cls_ops->clo_unuse(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + } while (result == CLO_REPEAT); + + return result; +} + +/** + * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling + * cl_lock_operations::clo_use() top-to-bottom to notify layers. 
+ * @atomic = 1, it must unuse the lock to recovery the lock to keep the + * use process atomic + */ +int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic) +{ + const struct cl_lock_slice *slice; + int result; + enum cl_lock_state state; + + cl_lock_trace(D_DLMTRACE, env, "use lock", lock); + + LASSERT(lock->cll_state == CLS_CACHED); + if (lock->cll_error) + return lock->cll_error; + + result = -ENOSYS; + state = cl_lock_intransit(env, lock); + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_use != NULL) { + result = slice->cls_ops->clo_use(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + + LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n", + lock->cll_state); + + if (result == 0) { + state = CLS_HELD; + } else { + if (result == -ESTALE) { + /* + * ESTALE means sublock being cancelled + * at this time, and set lock state to + * be NEW here and ask the caller to repeat. + */ + state = CLS_NEW; + result = CLO_REPEAT; + } + + /* @atomic means back-off-on-failure. */ + if (atomic) { + int rc; + rc = cl_unuse_try_internal(env, lock); + /* Vet the results. */ + if (rc < 0 && result > 0) + result = rc; + } + + } + cl_lock_extransit(env, lock, state); + return result; +} +EXPORT_SYMBOL(cl_use_try); + +/** + * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers + * top-to-bottom. + */ +static int cl_enqueue_kick(const struct lu_env *env, + struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + const struct cl_lock_slice *slice; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue != NULL) { + result = slice->cls_ops->clo_enqueue(env, + slice, io, flags); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + return result; +} + +/** + * Tries to enqueue a lock. + * + * This function is called repeatedly by cl_enqueue() until either lock is + * enqueued, or error occurs. This function does not block waiting for + * networking communication to complete. + * + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \see cl_enqueue() cl_lock_operations::clo_enqueue() + * \see cl_lock_state::CLS_ENQUEUED + */ +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + + cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + + result = lock->cll_error; + if (result != 0) + break; + + switch (lock->cll_state) { + case CLS_NEW: + cl_lock_state_set(env, lock, CLS_QUEUING); + /* fall-through */ + case CLS_QUEUING: + /* kick layers. */ + result = cl_enqueue_kick(env, lock, io, flags); + /* For AGL case, the cl_lock::cll_state may + * become CLS_HELD already. */ + if (result == 0 && lock->cll_state == CLS_QUEUING) + cl_lock_state_set(env, lock, CLS_ENQUEUED); + break; + case CLS_INTRANSIT: + LASSERT(cl_lock_is_intransit(lock)); + result = CLO_WAIT; + break; + case CLS_CACHED: + /* yank lock from the cache. */ + result = cl_use_try(env, lock, 0); + break; + case CLS_ENQUEUED: + case CLS_HELD: + result = 0; + break; + default: + case CLS_FREEING: + /* + * impossible, only held locks with increased + * ->cll_holds can be enqueued, and they cannot be + * freed. + */ + LBUG(); + } + } while (result == CLO_REPEAT); + return result; +} +EXPORT_SYMBOL(cl_enqueue_try); + +/** + * Cancel the conflicting lock found during previous enqueue. 
+ * + * \retval 0 conflicting lock has been canceled. + * \retval -ve error code. + */ +int cl_lock_enqueue_wait(const struct lu_env *env, + struct cl_lock *lock, + int keep_mutex) +{ + struct cl_lock *conflict; + int rc = 0; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + LASSERT(lock->cll_conflict != NULL); + + conflict = lock->cll_conflict; + lock->cll_conflict = NULL; + + cl_lock_mutex_put(env, lock); + LASSERT(cl_lock_nr_mutexed(env) == 0); + + cl_lock_mutex_get(env, conflict); + cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict); + cl_lock_cancel(env, conflict); + cl_lock_delete(env, conflict); + + while (conflict->cll_state != CLS_FREEING) { + rc = cl_lock_state_wait(env, conflict); + if (rc != 0) + break; + } + cl_lock_mutex_put(env, conflict); + lu_ref_del(&conflict->cll_reference, "cancel-wait", lock); + cl_lock_put(env, conflict); + + if (keep_mutex) + cl_lock_mutex_get(env, lock); + + LASSERT(rc <= 0); + return rc; +} +EXPORT_SYMBOL(cl_lock_enqueue_wait); + +static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_user_add(env, lock); + do { + result = cl_enqueue_try(env, lock, io, enqflags); + if (result == CLO_WAIT) { + if (lock->cll_conflict != NULL) + result = cl_lock_enqueue_wait(env, lock, 1); + else + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result != 0) + cl_unuse_try(env, lock); + LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL), + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + return result; +} + +/** + * Enqueues a lock. + * + * \pre current thread or io owns a hold on lock. + * + * \post ergo(result == 0, lock->users increased) + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + */ +int cl_enqueue(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + cl_lock_lockdep_acquire(env, lock, enqflags); + cl_lock_mutex_get(env, lock); + result = cl_enqueue_locked(env, lock, io, enqflags); + cl_lock_mutex_put(env, lock); + if (result != 0) + cl_lock_lockdep_release(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + return result; +} +EXPORT_SYMBOL(cl_enqueue); + +/** + * Tries to unlock a lock. + * + * This function is called to release underlying resource: + * 1. for top lock, the resource is sublocks it held; + * 2. for sublock, the resource is the reference to dlmlock. + * + * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see cl_unuse() cl_lock_operations::clo_unuse() + * \see cl_lock_state::CLS_CACHED + */ +int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + enum cl_lock_state state = CLS_NEW; + + cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock); + + if (lock->cll_users > 1) { + cl_lock_user_del(env, lock); + return 0; + } + + /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold + * underlying resources. */ + if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) { + cl_lock_user_del(env, lock); + return 0; + } + + /* + * New lock users (->cll_users) are not protecting unlocking + * from proceeding. 
From this point, lock eventually reaches + * CLS_CACHED, is reinitialized to CLS_NEW or fails into + * CLS_FREEING. + */ + state = cl_lock_intransit(env, lock); + + result = cl_unuse_try_internal(env, lock); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(result != CLO_WAIT); + cl_lock_user_del(env, lock); + if (result == 0 || result == -ESTALE) { + /* + * Return lock back to the cache. This is the only + * place where lock is moved into CLS_CACHED state. + * + * If one of ->clo_unuse() methods returned -ESTALE, lock + * cannot be placed into cache and has to be + * re-initialized. This happens e.g., when a sub-lock was + * canceled while unlocking was in progress. + */ + if (state == CLS_HELD && result == 0) + state = CLS_CACHED; + else + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + + /* + * Hide -ESTALE error. + * If the lock is a glimpse lock, and it has multiple + * stripes. Assuming that one of its sublock returned -ENAVAIL, + * and other sublocks are matched write locks. In this case, + * we can't set this lock to error because otherwise some of + * its sublocks may not be canceled. This causes some dirty + * pages won't be written to OSTs. -jay + */ + result = 0; + } else { + CERROR("result = %d, this is unlikely!\n", result); + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + } + return result ?: lock->cll_error; +} +EXPORT_SYMBOL(cl_unuse_try); + +static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + result = cl_unuse_try(env, lock); + if (result) + CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result); +} + +/** + * Unlocks a lock. + */ +void cl_unuse(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_unuse_locked(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_release(env, lock); +} +EXPORT_SYMBOL(cl_unuse); + +/** + * Tries to wait for a lock. + * + * This function is called repeatedly by cl_wait() until either lock is + * granted, or error occurs. This function does not block waiting for network + * communication to complete. + * + * \see cl_wait() cl_lock_operations::clo_wait() + * \see cl_lock_state::CLS_HELD + */ +int cl_wait_try(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_QUEUING || + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD || + lock->cll_state == CLS_INTRANSIT, + "lock state: %d\n", lock->cll_state); + LASSERT(lock->cll_users > 0); + LASSERT(lock->cll_holds > 0); + + result = lock->cll_error; + if (result != 0) + break; + + if (cl_lock_is_intransit(lock)) { + result = CLO_WAIT; + break; + } + + if (lock->cll_state == CLS_HELD) + /* nothing to do */ + break; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_wait != NULL) { + result = slice->cls_ops->clo_wait(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_HELD); + } + } while (result == CLO_REPEAT); + return result; +} +EXPORT_SYMBOL(cl_wait_try); + +/** + * Waits until enqueued lock is granted. 
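+ *
+ * On the happy path, a simplified top-level sequence using only interfaces
+ * from this file looks like (need, scope and source stand for
+ * caller-supplied values):
+ *
+ *     lock = cl_lock_hold(env, io, need, scope, source);
+ *     rc = cl_enqueue(env, lock, io, need->cld_enq_flags);
+ *     rc = cl_wait(env, lock);              (lock is now CLS_HELD)
+ *     ... I/O under the lock ...
+ *     cl_unuse(env, lock);                  (lock goes back to the cache)
+ *
+ * The hold and reference obtained from cl_lock_hold() are dropped
+ * separately when the caller is done with the lock; cl_lock_request()
+ * below bundles the lookup and enqueue steps for the common case.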
+ * + * \pre current thread or io owns a hold on the lock + * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \post ergo(result == 0, lock->cll_state == CLS_HELD) + */ +int cl_wait(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + cl_lock_mutex_get(env, lock); + + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD, + "Wrong state %d \n", lock->cll_state); + LASSERT(lock->cll_holds > 0); + + do { + result = cl_wait_try(env, lock); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result < 0) { + cl_unuse_try(env, lock); + cl_lock_lockdep_release(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, "wait lock", lock); + cl_lock_mutex_put(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD)); + return result; +} +EXPORT_SYMBOL(cl_wait); + +/** + * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock + * value. + */ +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + unsigned long pound; + unsigned long ounce; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + pound = 0; + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_weigh != NULL) { + ounce = slice->cls_ops->clo_weigh(env, slice); + pound += ounce; + if (pound < ounce) /* over-weight^Wflow */ + pound = ~0UL; + } + } + return pound; +} +EXPORT_SYMBOL(cl_lock_weigh); + +/** + * Notifies layers that lock description changed. + * + * The server can grant client a lock different from one that was requested + * (e.g., larger in extent). This method is called when actually granted lock + * description becomes known to let layers to accommodate for changed lock + * description. + * + * \see cl_lock_operations::clo_modify() + */ +int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc) +{ + const struct cl_lock_slice *slice; + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object_header *hdr = cl_object_header(obj); + int result; + + cl_lock_trace(D_DLMTRACE, env, "modify lock", lock); + /* don't allow object to change */ + LASSERT(obj == desc->cld_obj); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_modify != NULL) { + result = slice->cls_ops->clo_modify(env, slice, desc); + if (result != 0) + return result; + } + } + CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n", + PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu))); + /* + * Just replace description in place. Nothing more is needed for + * now. If locks were indexed according to their extent and/or mode, + * that index would have to be updated here. + */ + spin_lock(&hdr->coh_lock_guard); + lock->cll_descr = *desc; + spin_unlock(&hdr->coh_lock_guard); + return 0; +} +EXPORT_SYMBOL(cl_lock_modify); + +/** + * Initializes lock closure with a given origin. 
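+ *
+ * A closure is normally driven as: cl_lock_closure_init() on a mutexed
+ * origin lock, one or more cl_lock_closure_build() / cl_lock_enclosure()
+ * calls to try-lock and collect related locks (repeating when CLO_REPEAT
+ * is returned), cl_lock_disclosure() to release the collected mutices, and
+ * finally cl_lock_closure_fini().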
+ * + * \see cl_lock_closure + */ +void cl_lock_closure_init(const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait) +{ + LINVRNT(cl_lock_is_mutexed(origin)); + LINVRNT(cl_lock_invariant(env, origin)); + + INIT_LIST_HEAD(&closure->clc_list); + closure->clc_origin = origin; + closure->clc_wait = wait; + closure->clc_nr = 0; +} +EXPORT_SYMBOL(cl_lock_closure_init); + +/** + * Builds a closure of \a lock. + * + * Building of a closure consists of adding initial lock (\a lock) into it, + * and calling cl_lock_operations::clo_closure() methods of \a lock. These + * methods might call cl_lock_closure_build() recursively again, adding more + * locks to the closure, etc. + * + * \see cl_lock_closure + */ +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + const struct cl_lock_slice *slice; + int result; + + LINVRNT(cl_lock_is_mutexed(closure->clc_origin)); + LINVRNT(cl_lock_invariant(env, closure->clc_origin)); + + result = cl_lock_enclosure(env, lock, closure); + if (result == 0) { + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_closure != NULL) { + result = slice->cls_ops->clo_closure(env, slice, + closure); + if (result != 0) + break; + } + } + } + if (result != 0) + cl_lock_disclosure(env, closure); + return result; +} +EXPORT_SYMBOL(cl_lock_closure_build); + +/** + * Adds new lock to a closure. + * + * Try-locks \a lock and if succeeded, adds it to the closure (never more than + * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting + * until next try-lock is likely to succeed. + */ +int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + int result = 0; + + cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock); + if (!cl_lock_mutex_try(env, lock)) { + /* + * If lock->cll_inclosure is not empty, lock is already in + * this closure. + */ + if (list_empty(&lock->cll_inclosure)) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure", closure); + list_add(&lock->cll_inclosure, &closure->clc_list); + closure->clc_nr++; + } else + cl_lock_mutex_put(env, lock); + result = 0; + } else { + cl_lock_disclosure(env, closure); + if (closure->clc_wait) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure-w", closure); + cl_lock_mutex_put(env, closure->clc_origin); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + cl_lock_mutex_get(env, lock); + cl_lock_mutex_put(env, lock); + + cl_lock_mutex_get(env, closure->clc_origin); + lu_ref_del(&lock->cll_reference, "closure-w", closure); + cl_lock_put(env, lock); + } + result = CLO_REPEAT; + } + return result; +} +EXPORT_SYMBOL(cl_lock_enclosure); + +/** Releases mutices of enclosed locks. */ +void cl_lock_disclosure(const struct lu_env *env, + struct cl_lock_closure *closure) +{ + struct cl_lock *scan; + struct cl_lock *temp; + + cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin); + list_for_each_entry_safe(scan, temp, &closure->clc_list, + cll_inclosure){ + list_del_init(&scan->cll_inclosure); + cl_lock_mutex_put(env, scan); + lu_ref_del(&scan->cll_reference, "closure", closure); + cl_lock_put(env, scan); + closure->clc_nr--; + } + LASSERT(closure->clc_nr == 0); +} +EXPORT_SYMBOL(cl_lock_disclosure); + +/** Finalizes a closure. 
*/ +void cl_lock_closure_fini(struct cl_lock_closure *closure) +{ + LASSERT(closure->clc_nr == 0); + LASSERT(list_empty(&closure->clc_list)); +} +EXPORT_SYMBOL(cl_lock_closure_fini); + +/** + * Destroys this lock. Notifies layers (bottom-to-top) that lock is being + * destroyed, then destroy the lock. If there are holds on the lock, postpone + * destruction until all holds are released. This is called when a decision is + * made to destroy the lock in the future. E.g., when a blocking AST is + * received on it, or fatal communication error happens. + * + * Caller must have a reference on this lock to prevent a situation, when + * deleted lock lingers in memory for indefinite time, because nobody calls + * cl_lock_put() to finish it. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * \pre ergo(cl_lock_nesting(lock) == CNL_TOP, + * cl_lock_nr_mutexed(env) == 1) + * [i.e., if a top-lock is deleted, mutices of no other locks can be + * held, as deletion of sub-locks might require releasing a top-lock + * mutex] + * + * \see cl_lock_operations::clo_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP, + cl_lock_nr_mutexed(env) == 1)); + + cl_lock_trace(D_DLMTRACE, env, "delete lock", lock); + if (lock->cll_holds == 0) + cl_lock_delete0(env, lock); + else + lock->cll_flags |= CLF_DOOMED; +} +EXPORT_SYMBOL(cl_lock_delete); + +/** + * Mark lock as irrecoverably failed, and mark it for destruction. This + * happens when, e.g., server fails to grant a lock to us, or networking + * time-out happens. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * + * \see clo_lock_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_error == 0 && error != 0) { + cl_lock_trace(D_DLMTRACE, env, "set lock error", lock); + lock->cll_error = error; + cl_lock_signal(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } +} +EXPORT_SYMBOL(cl_lock_error); + +/** + * Cancels this lock. Notifies layers + * (bottom-to-top) that lock is being cancelled, then destroy the lock. If + * there are holds on the lock, postpone cancellation until + * all holds are released. + * + * Cancellation notification is delivered to layers at most once. + * + * \see cl_lock_operations::clo_cancel() + * \see cl_lock::cll_holds + */ +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + if (lock->cll_holds == 0) + cl_lock_cancel0(env, lock); + else + lock->cll_flags |= CLF_CANCELPEND; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. 
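+ *
+ * \a pending and \a canceld widen the search: with \a pending set, locks
+ * with CLF_CANCELPEND are still considered; with \a canceld set, locks
+ * with CLF_CANCELLED are still considered (see the flag checks in the
+ * function body).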
+ */ +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, + int pending, int canceld) +{ + struct cl_object_header *head; + struct cl_lock *scan; + struct cl_lock *lock; + struct cl_lock_descr *need; + + head = cl_object_header(obj); + need = &cl_env_info(env)->clt_descr; + lock = NULL; + + need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but + * not PHANTOM */ + need->cld_start = need->cld_end = index; + need->cld_enq_flags = 0; + + spin_lock(&head->coh_lock_guard); + /* It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too */ + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != except && + (scan->cll_descr.cld_mode == CLM_GROUP || + cl_lock_ext_match(&scan->cll_descr, need)) && + scan->cll_state >= CLS_HELD && + scan->cll_state < CLS_FREEING && + /* + * This check is racy as the lock can be canceled right + * after it is done, but this is fine, because page exists + * already. + */ + (canceld || !(scan->cll_flags & CLF_CANCELLED)) && + (pending || !(scan->cll_flags & CLF_CANCELPEND))) { + /* Don't increase cs_hit here since this + * is just a helper function. */ + cl_lock_get_trust(scan); + lock = scan; + break; + } + } + spin_unlock(&head->coh_lock_guard); + return lock; +} +EXPORT_SYMBOL(cl_lock_at_pgoff); + +/** + * Calculate the page offset at the layer of @lock. + * At the time of this writing, @page is top page and @lock is sub lock. + */ +static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock) +{ + struct lu_device_type *dtype; + const struct cl_page_slice *slice; + + dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type; + slice = cl_page_at(page, dtype); + LASSERT(slice != NULL); + return slice->cpl_page->cp_index; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + pgoff_t index = pgoff_at_lock(page, lock); + + if (index >= info->clt_fn_index) { + struct cl_lock *tmp; + + /* refresh non-overlapped index */ + tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index, + lock, 1, 0); + if (tmp != NULL) { + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, clt_fn_index). This + * is safe because if tmp lock is canceled, it will + * discard these pages. 
*/ + info->clt_fn_index = tmp->cll_descr.cld_end + 1; + if (tmp->cll_descr.cld_end == CL_PAGE_EOF) + info->clt_fn_index = CL_PAGE_EOF; + cl_lock_put(env, tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->clt_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, page)))); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(env, page)))); + + info->clt_next_index = pgoff_at_lock(page, lock) + 1; + if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_lock_descr *descr = &lock->cll_descr; + cl_page_gang_cb_t cb; + int res; + int result; + + LINVRNT(cl_lock_invariant(env, lock)); + + io->ci_obj = cl_object_top(descr->cld_obj); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + goto out; + + cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->clt_fn_index = info->clt_next_index = descr->cld_start; + do { + res = cl_page_gang_lookup(env, descr->cld_obj, io, + info->clt_next_index, descr->cld_end, + cb, (void *)lock); + if (info->clt_next_index > descr->cld_end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + return result; +} +EXPORT_SYMBOL(cl_lock_discard_pages); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). 
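+	 * The assertion below checks exactly that: with \a cancel == 0 the
+	 * page radix tree must already be empty and coh_pages must be zero.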
+ */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", current); + +again: + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_users <= 1); + if (unlikely(lock->cll_users == 1)) { + struct l_wait_info lwi = { 0 }; + + cl_lock_mutex_put(env, lock); + l_wait_event(lock->cll_wq, + lock->cll_users == 0, + &lwi); + goto again; + } + + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", current); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); +} +EXPORT_SYMBOL(cl_locks_prune); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED)) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + return lock; +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. + */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + return lock; +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + int rc; + __u32 enqflags = need->cld_enq_flags; + + do { + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (IS_ERR(lock)) + break; + + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + if (!(enqflags & CEF_AGL)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, + enqflags); + break; + } + rc = 1; + } + cl_unuse_locked(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, + rc <= 0 ? "enqueue failed" : "agl succeed", lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + if (rc > 0) { + LASSERT(enqflags & CEF_AGL); + lock = NULL; + } else if (rc != 0) { + lock = ERR_PTR(rc); + } + } while (rc == 0); + return lock; +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. 
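+ *
+ * The lock must be mutexed by the caller and must not be in CLS_FREEING
+ * (both asserted below). The hold and reference taken here are typically
+ * dropped with cl_lock_unhold(); \a scope and \a source are only debugging
+ * tags for lu_ref tracking.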
+ */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. + */ +void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + cl_lock_hold_release(env, lock, scope, source); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); +} +EXPORT_SYMBOL(cl_lock_unhold); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_mutex_get(env, lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); +} +EXPORT_SYMBOL(cl_lock_release); + +void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + cl_lock_used_mod(env, lock, +1); +} +EXPORT_SYMBOL(cl_lock_user_add); + +void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_users > 0); + + cl_lock_used_mod(env, lock, -1); + if (lock->cll_users == 0) + wake_up_all(&lock->cll_wq); +} +EXPORT_SYMBOL(cl_lock_user_del); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char *names[] = { + [CLM_PHANTOM] = "P", + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + if (0 <= mode && mode < ARRAY_SIZE(names)) + return names[mode]; + else + return "U"; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. 
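+ *
+ * Output goes through the supplied \a printer callback; after the generic
+ * header each layer's cl_lock_operations::clo_print() method is invoked in
+ * turn.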
+ */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ", + lock, atomic_read(&lock->cll_ref), + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); + +int cl_lock_init(void) +{ + return lu_kmem_init(cl_lock_caches); +} + +void cl_lock_fini(void) +{ + lu_kmem_fini(cl_lock_caches); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c new file mode 100644 index 000000000..f13d1fbff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -0,0 +1,1139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov + */ + +/* + * Locking. 
+ * + * i_mutex + * PG_locked + * ->coh_page_guard + * ->coh_lock_guard + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" +/* class_put_type() */ +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include +#include "../../include/linux/libcfs/libcfs_hash.h" /* for cfs_hash stuff */ +#include "../include/cl_object.h" +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_page_guard */ +static struct lock_class_key cl_page_guard_class; +/** Lock class of cl_object_header::coh_lock_guard */ +static struct lock_class_key cl_lock_guard_class; +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_page_guard); + spin_lock_init(&h->coh_lock_guard); + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class); + lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_pages = 0; + /* XXX hard coded GFP_* mask. */ + INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&h->coh_locks); + h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); + } + return result; +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + LASSERT(list_empty(&h->coh_locks)); + lu_object_header_fini(&h->coh_lu); +} +EXPORT_SYMBOL(cl_object_header_fini); + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. 
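+ *
+ * No additional reference is acquired on the returned top-object; it is a
+ * layer of the same object stack as \a o.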
+ * + * \see cl_page_top(), cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_set(). + */ +void cl_object_attr_lock(struct cl_object *o) + __acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) + __releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + return result; +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom + * to top. + */ +int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_set != NULL) { + result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + return result; +} +EXPORT_SYMBOL(cl_object_attr_set); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. 
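+ * The fields in question are size, mtime, atime, ctime and blocks (see
+ * cl_attr2lvb()), which is also what the debug message below prints.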
+ * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + return result; +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. + */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); + /* + * Destroy all locks. Object destruction (including cl_inode_fini()) + * cannot cancel the locks, because in the case of a local client, + * where client and server share the same thread running + * prune_icache(), this can dead-lock with ldlm_cancel_handler() + * waiting on __wait_on_freeing_inode(). + */ + cl_locks_prune(env, obj, 0); +} +EXPORT_SYMBOL(cl_object_kill); + +/** + * Prunes caches of pages and locks for this object. + */ +void cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + cl_pages_prune(env, obj); + cl_locks_prune(env, obj, 1); +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Check if the object has locks. + */ +int cl_object_has_locks(struct cl_object *obj) +{ + struct cl_object_header *head = cl_object_header(obj); + int has; + + spin_lock(&head->coh_lock_guard); + has = list_empty(&head->coh_locks); + spin_unlock(&head->coh_lock_guard); + + return (has == 0); +} +EXPORT_SYMBOL(cl_object_has_locks); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h) +{ + int i; + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. 
Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + int i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + cache_stats_init(&s->cs_locks, "locks"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i) + atomic_set(&s->cs_locks_state[i], 0); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + int i; + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + static const char *lstate[] = { + [CLS_NEW] = "n", + [CLS_QUEUING] = "q", + [CLS_ENQUEUED] = "e", + [CLS_HELD] = "h", + [CLS_INTRANSIT] = "t", + [CLS_CACHED] = "c", + [CLS_FREEING] = "f" + }; +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&site->cs_locks, m, 0); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + seq_printf(m, "%s: %u ", lstate[i], + atomic_read(&site->cs_locks_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: only on RedHat kernel. + * - ... + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /** + * This allows cl_env to be entered into cl_env_hash which implements + * the current thread -> client environment lookup. + */ + struct hlist_node ce_node; + /** + * Owner for the current cl_env. 
+ * + * If LL_TASK_CL_ENV is defined, this point to the owning current, + * only for debugging purpose ; + * Otherwise hash is used, and this is the key for cfs_hash. + * Now current thread pid is stored. Note using thread pointer would + * lead to unbalanced hash because of its specific allocation locality + * and could be varied for different platforms and OSes, even different + * OS versions. + */ + void *ce_owner; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. + */ + void *ce_debug; +}; + +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + + +/* + * The implementation of using hash table to connect cl_env and thread + */ + +static struct cfs_hash *cl_env_hash; + +static unsigned cl_env_hops_hash(struct cfs_hash *lh, + const void *key, unsigned mask) +{ +#if BITS_PER_LONG == 64 + return cfs_hash_u64_hash((__u64)key, mask); +#else + return cfs_hash_u32_hash((__u32)key, mask); +#endif +} + +static void *cl_env_hops_obj(struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); + return (void *)cle; +} + +static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn) +{ + struct cl_env *cle = cl_env_hops_obj(hn); + + LASSERT(cle->ce_owner != NULL); + return (key == cle->ce_owner); +} + +static void cl_env_hops_noop(struct cfs_hash *hs, struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); +} + +static cfs_hash_ops_t cl_env_hops = { + .hs_hash = cl_env_hops_hash, + .hs_key = cl_env_hops_obj, + .hs_keycmp = cl_env_hops_keycmp, + .hs_object = cl_env_hops_obj, + .hs_get = cl_env_hops_noop, + .hs_put_locked = cl_env_hops_noop, +}; + +static inline struct cl_env *cl_env_fetch(void) +{ + struct cl_env *cle; + + cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid); + LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0)); + return cle; +} + +static inline void cl_env_attach(struct cl_env *cle) +{ + if (cle) { + int rc; + + LASSERT(cle->ce_owner == NULL); + cle->ce_owner = (void *) (long) current->pid; + rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(rc == 0); + } +} + +static inline void cl_env_do_detach(struct cl_env *cle) +{ + void *cookie; + + LASSERT(cle->ce_owner == (void *) (long) current->pid); + cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(cookie == cle); + cle->ce_owner = NULL; +} + +static int cl_env_store_init(void) { + cl_env_hash = cfs_hash_create("cl_env", + HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, + HASH_CL_ENV_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &cl_env_hops, + CFS_HASH_RW_BKTLOCK); + return cl_env_hash != NULL ? 
0 :-ENOMEM; +} + +static void cl_env_store_fini(void) +{ + cfs_hash_putref(cl_env_hash); +} + + +static inline struct cl_env *cl_env_detach(struct cl_env *cle) +{ + if (cle == NULL) + cle = cl_env_fetch(); + + if (cle && cle->ce_owner) + cl_env_do_detach(cle); + + return cle; +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(lookup); + + /* check that we don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = cl_env_fetch(); + if (cle != NULL) { + CL_ENV_INC(hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(int *refcheck) +{ + struct lu_env *env; + + env = cl_env_peek(refcheck); + if (env == NULL) { + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, + __builtin_return_address(0)); + + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + cl_env_attach(cle); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(int *refcheck, __u32 tags) +{ + struct lu_env *env; + + LASSERT(cl_env_peek(refcheck) == NULL); + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + LASSERT(cle->ce_owner == NULL); + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Release an environment. + * + * Decrement \a env reference counter. 
When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + CL_ENV_DEC(busy); + cl_env_detach(cle); + cle->ce_debug = NULL; + cl_env_exit(cle); + cl_env_fini(cle); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Declares a point of re-entrancy. + * + * \see cl_env_reexit() + */ +void *cl_env_reenter(void) +{ + return cl_env_detach(NULL); +} +EXPORT_SYMBOL(cl_env_reenter); + +/** + * Exits re-entrancy. + */ +void cl_env_reexit(void *cookie) +{ + cl_env_detach(NULL); + cl_env_attach(cookie); +} +EXPORT_SYMBOL(cl_env_reexit); + +/** + * Setup user-supplied \a env as a current environment. This is to be used to + * guaranteed that environment exists even when cl_env_get() fails. It is up + * to user to ensure proper concurrency control. + * + * \see cl_env_unplant() + */ +void cl_env_implant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + + cl_env_attach(cle); + cl_env_get(refcheck); + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); +} +EXPORT_SYMBOL(cl_env_implant); + +/** + * Detach environment installed earlier by cl_env_implant(). + */ +void cl_env_unplant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 1); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + + cl_env_detach(cle); + cl_env_put(env, refcheck); +} +EXPORT_SYMBOL(cl_env_unplant); + +struct lu_env *cl_env_nested_get(struct cl_env_nest *nest) +{ + struct lu_env *env; + + nest->cen_cookie = NULL; + env = cl_env_peek(&nest->cen_refcheck); + if (env != NULL) { + if (!cl_io_is_going(env)) + return env; + else { + cl_env_put(env, &nest->cen_refcheck); + nest->cen_cookie = cl_env_reenter(); + } + } + env = cl_env_get(&nest->cen_refcheck); + if (IS_ERR(env)) { + cl_env_reexit(nest->cen_cookie); + return env; + } + + LASSERT(!cl_io_is_going(env)); + return env; +} +EXPORT_SYMBOL(cl_env_nested_get); + +void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env) +{ + cl_env_put(env, &nest->cen_refcheck); + cl_env_reexit(nest->cen_cookie); +} +EXPORT_SYMBOL(cl_env_nested_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; +} +EXPORT_SYMBOL(cl_attr2lvb); + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; +} +EXPORT_SYMBOL(cl_lvb2attr); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) { + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + int i; + + info = data; + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { + LASSERT(info->clt_counters[i].ctc_nr_held == 0); + LASSERT(info->clt_counters[i].ctc_nr_used == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. 
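+ *
+ * On failure the already-initialized pieces are unwound in reverse order
+ * (the out_* labels below); a successful call is undone by cl_global_fini().
+ * A hypothetical caller, e.g. module-init code, would simply do:
+ *
+ *	rc = cl_global_init();
+ *	if (rc != 0)
+ *		return rc;
+ *	...
+ *	cl_global_fini();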
+ * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = cl_env_store_init(); + if (result) + return result; + + result = lu_kmem_init(cl_object_caches); + if (result) + goto out_store; + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + goto out_kmem; + + result = cl_lock_init(); + if (result) + goto out_context; + + result = cl_page_init(); + if (result) + goto out_lock; + + return 0; +out_lock: + cl_lock_fini(); +out_context: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_store: + cl_env_store_fini(); + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + cl_env_store_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c new file mode 100644 index 000000000..b7dd04808 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -0,0 +1,1553 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include + +#include "../include/cl_object.h" +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) + +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) + +/* Disable page statistic by default due to huge performance penalty. */ +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) + +/** + * Internal version of cl_page_top, it should be called if the page is + * known to be not freed, says with page referenced, or radix tree lock held, + * or page owned. 
+ */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by locking page radix-tree + * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + + page = cl_page_top_trusted((struct cl_page *)page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + return slice; + } + page = page->cp_child; + } while (page != NULL); + return NULL; +} + +/** + * Returns a page with given index in the given object, or NULL if no page is + * found. Acquires a reference on \a page. + * + * Locking: called under cl_object_header::coh_page_guard spin-lock. + */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index) +{ + struct cl_page *page; + + assert_spin_locked(&hdr->coh_page_guard); + + page = radix_tree_lookup(&hdr->coh_tree, index); + if (page != NULL) + cl_page_get_trust(page); + return page; +} +EXPORT_SYMBOL(cl_page_lookup); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata) +{ + struct cl_object_header *hdr; + struct cl_page *page; + struct cl_page **pvec; + const struct cl_page_slice *slice; + const struct lu_device_type *dtype; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + int tree_lock = 1; + + idx = start; + hdr = cl_object_header(obj); + pvec = cl_env_info(env)->clt_pvec; + dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type; + spin_lock(&hdr->coh_page_guard); + while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec, + idx, CLT_PVEC_SIZE)) > 0) { + int end_of_region = 0; + idx = pvec[nr - 1]->cp_index + 1; + for (i = 0, j = 0; i < nr; ++i) { + page = pvec[i]; + pvec[i] = NULL; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_index > end) { + end_of_region = 1; + break; + } + if (page->cp_state == CPS_FREEING) + continue; + + slice = cl_page_at_trusted(page, dtype); + /* + * Pages for lsm-less file has no underneath sub-page + * for osc, in case of ... + */ + PASSERT(env, page, slice != NULL); + + page = slice->cpl_page; + /* + * Can safely call cl_page_get_trust() under + * radix-tree spin-lock. 
+ * + * XXX not true, because @page is from object another + * than @hdr and protected by different tree lock. + */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&hdr->coh_page_guard); + tree_lock = 0; + + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, page, cbdata); + lu_ref_del(&page->cp_reference, + "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < CLT_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&hdr->coh_page_guard); + tree_lock = 1; + } + if (tree_lock) + spin_unlock(&hdr->coh_page_guard); + return res; +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. 
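+	 * cl_page::cp_state is declared const precisely so that layers cannot
+	 * update it directly; casting the const away here keeps this helper
+	 * the only writer of the field, as the comment above notes.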
*/ + *(enum cl_page_state *)&page->cp_state = state; +} + +static struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + GFP_NOFS); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + if (type == CPT_CACHEABLE) /* for radix tree */ + atomic_inc(&page->cp_ref); + page->cp_obj = o; + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page", + page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (result != 0) { + cl_page_delete0(env, page, 0); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + return page; +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +static struct cl_page *cl_page_find0(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type, + struct cl_page *parent) +{ + struct cl_page *page = NULL; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + int err; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } + + if (page != NULL) { + CS_PAGE_INC(o, hit); + return page; + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + if (IS_ERR(page)) + return page; + + if (type == CPT_TRANSIENT) { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + return page; + } + + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). 
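+	 *
+	 * A sketch of that optimization (not implemented here): with a
+	 * GFP_KERNEL tree the insert path would become
+	 *
+	 *	if (radix_tree_preload(GFP_KERNEL) == 0) {
+	 *		spin_lock(&hdr->coh_page_guard);
+	 *		err = radix_tree_insert(&hdr->coh_tree, idx, page);
+	 *		...
+	 *		spin_unlock(&hdr->coh_page_guard);
+	 *		radix_tree_preload_end();
+	 *	}
+	 *
+	 * so that tree-node allocation happens outside the spin-lock.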
+ */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + page = ERR_PTR(err); + CL_PAGE_DEBUG(D_ERROR, env, ghost, + "fail to insert into radix tree: %d\n", err); + } else { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + hdr->coh_pages++; + } + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + return page; +} + +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + return cl_page_find0(env, o, idx, vmpage, type, NULL); +} +EXPORT_SYMBOL(cl_page_find); + + +struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent) +{ + return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent); +} +EXPORT_SYMBOL(cl_page_find_sub); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + /* + * Page invariant is protected by a VM lock. + */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return cl_page_in_use(pg) && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
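+	 *
+	 * In words: PAGEIN is entered only from OWNED, PAGEOUT from OWNED or
+	 * directly from CACHED (write-out of a cached page), both complete
+	 * back to CACHED, and FREEING is terminal; the zero entries are the
+	 * forbidden transitions.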
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + } +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + cl_page_get_trust(page); +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. + */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + return slice->cpl_ops->cpo_vmpage(env, slice); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. 
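+ *
+ * The caller must hold the lock on \a vmpage (asserted below). When a
+ * matching page is found a reference is acquired on it, to be dropped with
+ * cl_page_put(); NULL is returned if \a vmpage has no cl_page attached or
+ * none of its layers belongs to \a obj.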
+ */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *top; + struct cl_page *page; + + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. + */ + top = (struct cl_page *)vmpage->private; + if (top == NULL) + return NULL; + + for (page = top; page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE)); + return page; +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. 
*/ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + return CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + page->cp_task = NULL; + } + } +} + +static void cl_page_owner_set(struct cl_page *page) +{ + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + return pg->cp_state == CPS_OWNED && pg->cp_owner == io; +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
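+ *
+ * Typical usage (sketch; "io" and "pg" are assumed to come from the
+ * caller's cl_io and cl_page_find() context):
+ *
+ *      if (cl_page_own(env, io, pg) == 0) {
+ *              ... operate on the owned page ...
+ *              cl_page_disown(env, io, pg);
+ *      }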
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + } else { + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), + (const struct lu_env *, + const struct cl_page_slice *, + struct cl_io *, int), + io, nonblock); + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + PASSERT(env, pg, pg->cp_req == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + } + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + return result; +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_disown0(env, io, pg); +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. 
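+ *
+ * See page_prune_cb() below for the usual own -> unmap -> discard -> disown
+ * sequence used when an object's cached pages are pruned.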
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+			    int radix)
+{
+	struct cl_page *tmp = pg;
+
+	PASSERT(env, pg, pg == cl_page_top(pg));
+	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+	/*
+	 * Sever all ways to obtain new pointers to @pg.
+	 */
+	cl_page_owner_clear(pg);
+
+	/*
+	 * Unexport the page first, before freeing it, so that
+	 * the page content is considered to be invalid.
+	 * We have to do this because a CPS_FREEING cl_page may
+	 * be NOT under the protection of a cl_lock.
+	 * Afterwards, if this page is found by other threads, then this
+	 * page will be forced to reread.
+	 */
+	cl_page_export(env, pg, 0);
+	cl_page_state_set0(env, pg, CPS_FREEING);
+
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+		       (const struct lu_env *, const struct cl_page_slice *));
+
+	if (tmp->cp_type == CPT_CACHEABLE) {
+		if (!radix)
+			/* !radix means that @pg is not yet in the radix tree,
+			 * skip removing it.
+			 */
+			tmp = pg->cp_child;
+		for (; tmp != NULL; tmp = tmp->cp_child) {
+			void *value;
+			struct cl_object_header *hdr;
+
+			hdr = cl_object_header(tmp->cp_obj);
+			spin_lock(&hdr->coh_page_guard);
+			value = radix_tree_delete(&hdr->coh_tree,
+						  tmp->cp_index);
+			PASSERT(env, tmp, value == tmp);
+			PASSERT(env, tmp, hdr->coh_pages > 0);
+			hdr->coh_pages--;
+			spin_unlock(&hdr->coh_page_guard);
+			cl_page_put(env, tmp);
+		}
+	}
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ * - removes page from the radix trees,
+ *
+ * - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	cl_page_delete0(env, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+		  struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), + (const struct lu_env *, + const struct cl_page_slice *, int), uptodate); +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + int result; + const struct cl_page_slice *slice; + + pg = cl_page_top_trusted((struct cl_page *)pg); + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + return result == -EBUSY; +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + return crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN; +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); + if (result == 0) + cl_page_io_start(env, pg, crt); + + KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE, + equi(result == 0, + PageWriteback(cl_page_vmpage(env, pg))))); + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
+ * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + /* cl_page::cp_req already cleared by the caller (osc_completion()) */ + PASSERT(env, pg, pg->cp_req == NULL); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + if (crt == CRT_READ && ioret == 0) { + PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); + pg->cp_flags |= CPF_READ_COMPLETED; + } + + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), + (const struct lu_env *, + const struct cl_page_slice *, int), ioret); + if (anchor) { + LASSERT(cl_page_is_vmlocked(env, pg)); + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + } + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, pg); + + if (anchor) + cl_sync_io_note(anchor, ioret); +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, crt < CRT_NR); + + if (crt >= CRT_NR) + return -EINVAL; + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), + (const struct lu_env *, + const struct cl_page_slice *)); + if (result == 0) { + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Notify layers that high level io decided to place this page into a cache + * for future transfer. + * + * The layer implementing transfer engine (osc) has to register this page in + * its queues. + * + * \pre cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_cache_add() + */ +int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *scan; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + if (crt >= CRT_NR) + return -EINVAL; + + list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) { + if (scan->cpl_ops->io[crt].cpo_cache_add == NULL) + continue; + + result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io); + if (result != 0) + break; + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_cache_add); + +/** + * Called if a pge is being written back by kernel's intention. 
+ * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + return result; +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Checks whether page is protected by any extent lock is at least required + * mode. + * + * \return the same as in cl_page_operations::cpo_is_under_lock() method. + * \see cl_page_operations::cpo_is_under_lock() + */ +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + int rc; + + PINVRNT(env, page, cl_page_invariant(page)); + + rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + PASSERT(env, page, rc != 0); + return rc; +} +EXPORT_SYMBOL(cl_page_is_under_lock); + +static int page_prune_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + cl_page_own(env, io, page); + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + return CLP_GANG_OKAY; +} + +/** + * Purges all cached pages belonging to the object \a obj. + */ +int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj) +{ + struct cl_thread_info *info; + struct cl_object *obj = cl_object_top(clobj); + struct cl_io *io; + int result; + + info = cl_env_info(env); + io = &info->clt_io; + + /* + * initialize the io. This is ugly since we never do IO in this + * function, we just make cl_page_list functions happy. -jay + */ + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result != 0) { + cl_io_fini(env, io); + return io->ci_result; + } + + do { + result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, + page_prune_cb, NULL); + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + + cl_io_fini(env, io); + return result; +} +EXPORT_SYMBOL(cl_pages_prune); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), + (const struct lu_env *, + const struct cl_page_slice *,int, int), + from, to); +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_index, pg->cp_parent, pg->cp_child, + pg->cp_state, pg->cp_error, pg->cp_type, + pg->cp_owner, pg->cp_req, pg->cp_flags); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + struct cl_page *scan; + + for (scan = cl_page_top((struct cl_page *)pg); + scan != NULL; scan = scan->cp_child) + cl_page_header_print(env, cookie, printer, scan); + CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), + (const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p), cookie, printer); + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), + (const struct lu_env *, + const struct cl_page_slice *)); +} +EXPORT_SYMBOL(cl_page_cancel); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + /* + * XXX for now. + */ + return (loff_t)idx << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + /* + * XXX for now. + */ + return offset >> PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +int cl_page_size(const struct cl_object *obj) +{ + return 1 << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = page; +} +EXPORT_SYMBOL(cl_page_slice_add); + +int cl_page_init(void) +{ + return 0; +} + +void cl_page_fini(void) +{ +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c b/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c new file mode 100644 index 000000000..d4b74b670 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -0,0 +1,704 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +# include + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../../include/linux/lnet/lnetctl.h" +#include "../include/lustre_debug.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_build_version.h" +#include +#include "../include/cl_object.h" +#include "llog_internal.h" + + +struct obd_device *obd_devs[MAX_OBD_DEVICES]; +EXPORT_SYMBOL(obd_devs); +struct list_head obd_types; +DEFINE_RWLOCK(obd_dev_lock); + +__u64 obd_max_pages = 0; +EXPORT_SYMBOL(obd_max_pages); +__u64 obd_max_alloc = 0; +EXPORT_SYMBOL(obd_max_alloc); +__u64 obd_alloc; +EXPORT_SYMBOL(obd_alloc); +__u64 obd_pages; +EXPORT_SYMBOL(obd_pages); +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_alloc_fail_rate = 0; +EXPORT_SYMBOL(obd_alloc_fail_rate); +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_max_dirty_pages = 256; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +EXPORT_SYMBOL(obd_jobid_var); + +char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1]; + +/* Get jobid of current process from stored variable or calculate + * it from pid and user_id. + * + * Historically this was also done by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * This is now deprecated. + */ +int lustre_get_jobid(char *jobid) +{ + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + return 0; + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + current_comm(), + from_kuid(&init_user_ns, current_fsuid())); + return 0; + } + + /* Whole node dedicated to single job */ + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + strcpy(jobid, obd_jobid_node); + return 0; + } + + return -ENOENT; +} +EXPORT_SYMBOL(lustre_get_jobid); + +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line) +{ + if (ptr == NULL || + (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) { + CERROR("%s%salloc of %s (%llu bytes) failed at %s:%d\n", + ptr ? 
"force " :"", type, name, (__u64)size, file, + line); + CERROR("%llu total bytes and %llu total pages (%llu bytes) allocated by Lustre, %d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << PAGE_CACHE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(obd_alloc_fail); + +static inline void obd_data2conn(struct lustre_handle *conn, + struct obd_ioctl_data *data) +{ + memset(conn, 0, sizeof(*conn)); + conn->cookie = data->ioc_cookie; +} + +static inline void obd_conn2data(struct obd_ioctl_data *data, + struct lustre_handle *conn) +{ + data->ioc_cookie = conn->cookie; +} + +int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + if (!len || !name) { + CERROR("No name passed,!\n"); + rc = -EINVAL; + goto out; + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + return rc; +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data *)arg; + libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void *)arg)) { + CERROR("OBD ioctl: data error\n"); + return -EINVAL; + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + err = -EINVAL; + goto out; + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) { + err = -ENOMEM; + goto out; + } + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + goto out; + } + + case OBD_GET_VERSION: + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + err = -EINVAL; + goto out; + } + + if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + err = -EINVAL; + goto out; + } + + memcpy(data->ioc_bulk, BUILD_VERSION, + strlen(BUILD_VERSION) + 1); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + goto out; + + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) { + err = -EINVAL; + goto out; + } + + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + goto out; + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. 
+ */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + err = -EINVAL; + goto out; + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + err = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + err = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + goto out; + } + + case OBD_IOC_CLOSE_UUID: { + CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", + data->ioc_inlbuf1); + err = 0; + goto out; + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + err = -EINVAL; + goto out; + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + err = -EINVAL; + goto out; + } + + obd = class_num2obd(index); + if (!obd) { + err = -ENOENT; + goto out; + } + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + err = obd_ioctl_popdata((void *)arg, data, len); + + err = 0; + goto out; + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) { + err = -EINVAL; + goto out; + } + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) { + err = -EINVAL; + goto out; + } + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + err = -EINVAL; + goto out; + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + err = -EINVAL; + goto out; + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d\n", data->ioc_dev); + err = -EINVAL; + goto out; + } + + switch (cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + err = -ENODEV; + goto out; + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + err = 0; + goto out; + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + goto out; + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + goto out; + } + } + + out: + if (buf) + obd_ioctl_freedata(buf, len); + return err; +} /* class_handle_ioctl */ + +#define OBD_INIT_CHECK +int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", "%llu", "%lld", "%#llx"); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, 
(int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val &255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) { + CWARN("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_CACHE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +#if defined (CONFIG_PROC_FS) +extern int class_procfs_init(void); +extern int class_procfs_clean(void); +#else +static inline int class_procfs_init(void) +{ return 0; } +static inline int class_procfs_clean(void) +{ return 0; } +#endif + +static int __init init_obdclass(void) +{ + int i, err; + int lustre_register_fs(void); + + for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++) + INIT_LIST_HEAD(&capa_list[i]); + + LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n"); + + spin_lock_init(&obd_types_lock); + obd_zombie_impexp_init(); + + if (IS_ENABLED(CONFIG_PROC_FS)) { + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); + lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT, + LPROCFS_CNTR_AVGMINMAX, + "pagesused", "pages"); + } + + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + + class_init_uuidlist(); + err = class_handle_init(); + if (err) + return err; + + INIT_LIST_HEAD(&obd_types); + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err); + return err; + } + + /* This struct is already zeroed for us (static global) */ + for (i = 0; i < class_devno_max(); i++) + obd_devs[i] = NULL; + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). 
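+ *
+ * With 4 KiB pages (PAGE_CACHE_SHIFT == 12) the threshold below is
+ * 512 << 8 = 131072 pages, i.e. 512 MiB of RAM: hosts at or below
+ * that get a cap of totalram_pages / 4, larger hosts get
+ * totalram_pages / 2.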
*/ + if (totalram_pages <= 512 << (20 - PAGE_CACHE_SHIFT)) + obd_max_dirty_pages = totalram_pages / 4; + else + obd_max_dirty_pages = totalram_pages / 2; + + err = obd_init_caches(); + if (err) + return err; + + obd_sysctl_init(); + + err = class_procfs_init(); + if (err) + return err; + + err = lu_global_init(); + if (err) + return err; + + err = cl_global_init(); + if (err != 0) + return err; + + + err = llog_info_init(); + if (err) + return err; + + err = lustre_register_fs(); + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max1, max2; + + max1 = obd_pages_sum(); + max2 = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max1 > obd_max_pages) + obd_max_pages = max1; + if (max2 > obd_max_alloc) + obd_max_alloc = max2; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#if defined (CONFIG_PROC_FS) +__u64 obd_memory_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_memory_max); + +__u64 obd_pages_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_pages; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_pages_max); +#endif + +/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this + * ifdef to the end of the file to cover module and versioning goo.*/ +static void cleanup_obdclass(void) +{ + int i; + int lustre_unregister_fs(void); + __u64 memory_leaked, pages_leaked; + __u64 memory_max, pages_max; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + if (obd && obd->obd_set_up && + OBT(obd) && OBP(obd, detach)) { + /* XXX should this call generic detach otherwise? */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + OBP(obd, detach)(obd); + } + } + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_exit_uuidlist(); + obd_zombie_impexp_stop(); + + memory_leaked = obd_memory_sum(); + pages_leaked = obd_pages_sum(); + + memory_max = obd_memory_max(); + pages_max = obd_pages_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); + CDEBUG((pages_leaked) ? D_ERROR : D_INFO, + "obd_memory_pages max: %llu, leaked: %llu\n", + pages_max, pages_leaked); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(init_obdclass); +module_exit(cleanup_obdclass); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/debug.c b/kernel/drivers/staging/lustre/lustre/obdclass/debug.c new file mode 100644 index 000000000..9c934e6d2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/debug.c @@ -0,0 +1,109 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + +#include + +#include "../include/obd_support.h" +#include "../include/lustre_debug.h" +#include "../include/lustre_net.h" + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset=%lld, len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->len, nb->page, nb->rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->page ? page_index(nb->page) : -1); +} +EXPORT_SYMBOL(dump_lniobuf); + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + put_unaligned_le64(off, addr); + put_unaligned_le64(id, addr+LPDS); + addr += len - LPDS - LPDS; + put_unaligned_le64(off, addr); + put_unaligned_le64(id, addr+LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c new file mode 100644 index 000000000..b1eee0a6d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c @@ -0,0 +1,1059 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. + * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd.h" +#include "../include/dt_object.h" +#include +/* fid_be_to_cpu() */ +#include "../include/lustre_fid.h" + +#include "../include/lustre_quota.h" + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; +EXPORT_SYMBOL(dt_key); + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_start(env, th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn) +{ + struct dt_device *dev = txn->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (txn->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *txn) +{ + struct dt_txn_callback *cb; + + if (txn->th_local) + return; + + list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks, + dtc_linkage) { + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(txn, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} 
+EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name, BYPASS_CAPA); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, const struct lu_fid *fid, + struct lu_device *top_dev) +{ + struct lu_object *lo, *n; + + lo = lu_object_find_at(env, top_dev, fid, NULL); + if (IS_ERR(lo)) + return (void *)lo; + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find a object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int result; + + result = dt_lookup_dir(env, obj, entry, fid); + lu_object_put(env, &obj->do_lu); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + dfh->dfh_o = obj; + return result; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
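+ *
+ * For example, parsing the (hypothetical) path "a/b/c" calls
+ * entry_func(env, "a", data), entry_func(env, "b", data) and
+ * entry_func(env, "c", data) in order; empty components produced by
+ * doubled or trailing '/' are skipped, and parsing stops at the first
+ * non-zero return from \a entry_func.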
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + char *local = info->dti_buf; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, local, dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(dt_store_resolve); + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, + filename, fid); + lu_object_put(env, &dir->do_lu); + } else { + file = dir; + } + return file; +} +EXPORT_SYMBOL(dt_store_open); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + return dto; + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + return dto; + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) { + rc = PTR_ERR(th); + goto out; + } + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + goto trans_stop; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto trans_stop; + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) { + rc = 0; + goto unlock; + } + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + goto unlock; + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + lu_object_put(env, &dto->do_lu); + return ERR_PTR(rc); + } + return dto; +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&dt_key); + return lu_context_key_register(&dt_key); +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); + + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck */ +const struct dt_index_features dt_lfsck_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), 
+ .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + int nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + int rc, size; + + /* no support for variable key & record size for now */ + LASSERT((ii->ii_flags & II_FL_VARKEY) == 0); + LASSERT((ii->ii_flags & II_FL_VARREC) == 0); + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + 
lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize + ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) { + rc = 0; + goto out; + } + } + + if (nob < size) { + if (lip->lip_nr == 0) + rc = -EINVAL; + else + rc = 0; + goto out; + } + + if ((ii->ii_flags & II_FL_NOHASH) == 0) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + /* then the key value */ + LASSERT(iops->key_size(env, it) == ii->ii_keysize); + key = iops->key(env, it); + memcpy(tmp_entry, key, ii->ii_keysize); + tmp_entry += ii->ii_keysize; + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + goto out; + + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry = tmp_entry + ii->ii_recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + goto out; +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + unsigned int pageidx, nob, nlupgs = 0; + int rc; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + nob = rdpg->rp_count; + if (nob <= 0) + return -EFAULT; + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA); + if (IS_ERR(it)) + return PTR_ERR(it); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. 
*/ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + return rc; +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count <= 0 && (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + return -EFAULT; + + if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL) + /* we don't support directory transfer via OBD_IDX_READ for the + * time being */ + return -EOPNOTSUPP; + + if (!fid_is_quota(&ii->ii_fid)) + /* block access to all local files except quota files */ + return -EPERM; + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + return PTR_ERR(obj); + if (dt_object_exists(obj) == 0) { + rc = -ENOENT; + goto out; + } + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) { + rc = PTR_ERR(feat); + goto out; + } + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + goto out; + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= II_FL_NOHASH; + + ii->ii_keysize = feat->dif_keysize_max; + if ((feat->dif_flags & DT_IND_VARKEY) != 0) { + /* key size is variable */ + ii->ii_flags |= II_FL_VARKEY; + /* we don't support variable key size for the time being */ + rc = -EOPNOTSUPP; + goto out; + } + + ii->ii_recsize = feat->dif_recsize_max; + if ((feat->dif_flags & DT_IND_VARREC) != 0) { + /* record size is variable */ + ii->ii_flags |= II_FL_VARREC; + /* we don't support variable record size for the time being */ + rc = -EOPNOTSUPP; + goto out; + } + + if ((feat->dif_flags & DT_IND_NONUNQ) != 0) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii); + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + goto out; 
+out: + lu_object_put(env, &obj->do_lu); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#if defined (CONFIG_PROC_FS) + +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%u\n", + (unsigned) osfs.os_bsize); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_blksize); + +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal); + +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree); + +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail); + +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%llu\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filestotal); + +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%llu\n", osfs.os_ffree); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filesfree); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/genops.c b/kernel/drivers/staging/lustre/lustre/obdclass/genops.c new file mode 100644 index 000000000..66b56784f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/genops.c @@ -0,0 +1,1833 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" + +spinlock_t obd_types_lock; + +struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +static struct kmem_cache *import_cachep; + +static struct list_head obd_zombie_imports; +static struct list_head obd_zombie_exports; +static spinlock_t obd_zombie_impexp_lock; +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + 
try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} +EXPORT_SYMBOL(class_get_type); + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} +EXPORT_SYMBOL(class_put_type); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, + struct lprocfs_vars *vars, const char *name, + struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + return -EEXIST; + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + return rc; + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + goto failed; + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + goto failed; + } + + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + goto failed; + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + return 0; + + failed: + if (type->typ_name != NULL) + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + return rc; +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + CERROR("unknown obd type\n"); + return -EINVAL; + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + return -EBUSY; + } + + if (type->typ_procroot) { + lprocfs_remove(&type->typ_procroot); + } + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + return 0; +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Find an empty slot in ::obd_devs[], create a new obd device in it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. + * + * \retval NULL if create fails, otherwise return the obd device + * pointer created. 
+ */ +struct obd_device *class_newdev(const char *type_name, const char *name) +{ + struct obd_device *result = NULL; + struct obd_device *newdev; + struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + return ERR_PTR(-EINVAL); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + return ERR_PTR(-ENODEV); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + result = ERR_PTR(-ENOMEM); + goto out_type; + } + + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && (strcmp(name, obd->obd_name) == 0)) { + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0] = '\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); + + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + result = ERR_PTR(-EOVERFLOW); + goto out; + } + + if (IS_ERR(result)) + goto out; + + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + return result; +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; +} + +void class_release_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; + write_unlock(&obd_dev_lock); + obd_device_free(obd); + + class_put_type(obd_type); +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + read_unlock(&obd_dev_lock); + return i; + } + break; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, 
				&obd->obd_uuid)) {
+			LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+			read_unlock(&obd_dev_lock);
+			return i;
+		}
+	}
+	read_unlock(&obd_dev_lock);
+
+	return -1;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+	int dev = class_uuid2dev(uuid);
+	if (dev < 0)
+		return NULL;
+	return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contain an obd device,
+ *         otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+	struct obd_device *obd = NULL;
+
+	if (num < class_devno_max()) {
+		obd = obd_devs[num];
+		if (obd == NULL)
+			return NULL;
+
+		LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+			 "%p obd_magic %08x != %08x\n",
+			 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+		LASSERTF(obd->obd_minor == num,
+			 "%p obd_minor %0d != %0d\n",
+			 obd, obd->obd_minor, num);
+	}
+
+	return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get the obd device count. Devices in any state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+	int index, max_index = class_devno_max(), dev_count = 0;
+
+	read_lock(&obd_dev_lock);
+	for (index = 0; index <= max_index; index++) {
+		struct obd_device *obd = class_num2obd(index);
+		if (obd != NULL)
+			dev_count++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+void class_obd_list(void)
+{
+	char *status;
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+		else
+			status = "--";
+		LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+			 i, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+	}
+	read_unlock(&obd_dev_lock);
+	return;
+}
+
+/* Search for a client OBD connected to tgt_uuid. If grp_uuid is
+   specified, then only the client with that uuid is returned,
+   otherwise any client connected to the tgt is returned. */
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+					 const char *typ_name,
+					 struct obd_uuid *grp_uuid)
+{
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		if ((strncmp(obd->obd_type->typ_name, typ_name,
+			     strlen(typ_name)) == 0)) {
+			if (obd_uuid_equals(tgt_uuid,
+					    &obd->u.cli.cl_target_uuid) &&
+			    ((grp_uuid)? obd_uuid_equals(grp_uuid,
+							 &obd->obd_uuid) : 1)) {
+				read_unlock(&obd_dev_lock);
+				return obd;
+			}
+		}
+	}
+	read_unlock(&obd_dev_lock);
+
+	return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices that have grp_uuid.
+   Start searching at *next, and if a device is found, the next index
+   to look at is saved in *next. If next is NULL, then the first
+   matching device will always be returned.
*/ +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, mdt, ost */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __func__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __func__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + if (capa_cachep) { + kmem_cache_destroy(capa_cachep); + capa_cachep = NULL; + } +} + +int obd_init_caches(void) +{ + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + goto out; + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + goto out; + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + goto out; + + LASSERT(capa_cachep == NULL); + capa_cachep = kmem_cache_create("capa_cache", + sizeof(struct obd_capa), 0, 0, NULL); + if (!capa_cachep) + goto out; + + return 0; + out: + obd_cleanup_caches(); + return -ENOMEM; + +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + return NULL; + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + return NULL; + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie); + return export; +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + 
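For context, the handle-to-device translation that these helpers support can be sketched as follows. This is an illustrative fragment only (the wrapper name example_conn2obd() is not part of the patch); it simply chains class_conn2export(), class_exp2obd() and class_export_put(), which is essentially what class_conn2obd() below does:

/* Illustrative sketch only: resolve a connection handle to its obd_device.
 * class_conn2export() returns the export with a reference held, so the
 * caller must drop it with class_export_put() after reading the device. */
static struct obd_device *example_conn2obd(struct lustre_handle *conn)
{
	struct obd_export *exp = class_conn2export(conn);
	struct obd_device *obd = NULL;

	if (exp) {
		obd = class_exp2obd(exp);	/* returns exp->exp_obd */
		class_export_put(exp);		/* drop the lookup reference */
	}
	return obd;
}

The real class_conn2obd() and class_conn2cliimp() that follow apply the same put-after-use discipline.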
+struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} +EXPORT_SYMBOL(class_conn2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_conn2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_uncommitted_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_hp_rpcs)); + obd_destroy_export(exp); + class_decref(obd, "export", exp); + + OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); +} + +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +static struct portals_handle_ops export_handle_ops = { + .hop_addref = export_handle_addref, + .hop_free = NULL, +}; + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} +EXPORT_SYMBOL(class_export_get); + +void class_export_put(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + + if (atomic_dec_and_test(&exp->exp_refcount)) { + LASSERT(!list_empty(&exp->exp_obd_chain)); + CDEBUG(D_IOCTL, "final put %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* release nid stat refererence */ + lprocfs_exp_cleanup(exp); + + obd_zombie_export_add(exp); + } +} +EXPORT_SYMBOL(class_export_put); + +/* Creates a new export, adds it to the hash table, and returns a + * pointer to it. The refcount is 2: one for the hash reference, and + * one for the pointer returned by this function. 
*/ +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + struct cfs_hash *hash = NULL; + int rc = 0; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = get_seconds(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) { + rc = -ENODEV; + goto exit_unlock; + } + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) { + rc = -ENODEV; + goto exit_unlock; + } + spin_unlock(&obd->obd_dev_lock); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + obd->obd_name, cluuid->uuid, rc); + rc = -EALREADY; + goto exit_err; + } + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + rc = -ENODEV; + goto exit_unlock; + } + + class_incref(obd, "export", export); + list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); + export->exp_obd->obd_num_exports++; + spin_unlock(&obd->obd_dev_lock); + cfs_hash_putref(hash); + return export; + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); +exit_err: + if (hash) + cfs_hash_putref(hash); + class_handle_unhash(&export->exp_handle); + LASSERT(hlist_unhashed(&export->exp_uuid_hash)); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(class_new_export); + +void class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (!hlist_unhashed(&exp->exp_uuid_hash)) + cfs_hash_del(exp->exp_obd->obd_uuid_hash, + &exp->exp_client_uuid, + &exp->exp_uuid_hash); + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + class_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void class_import_destroy(struct obd_import *imp) +{ + CDEBUG(D_IOCTL, 
"destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT_ATOMIC_ZERO(&imp->imp_refcount); + + ptlrpc_put_connection_superhack(imp->imp_connection); + + while (!list_empty(&imp->imp_conn_list)) { + struct obd_import_conn *imp_conn; + + imp_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); + list_del_init(&imp_conn->oic_item); + ptlrpc_put_connection_superhack(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); +} + +static void import_handle_addref(void *import) +{ + class_import_get(import); +} + +static struct portals_handle_ops import_handle_ops = { + .hop_addref = import_handle_addref, + .hop_free = NULL, +}; + +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + LASSERT(list_empty(&imp->imp_zombie_chain)); + LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + atomic_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (atomic_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + /* catch possible import put race */ + LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. (But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
*/ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + class_handle_unhash(&import->imp_handle); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. 
 */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid)
+{
+	struct obd_export *export;
+	LASSERT(conn != NULL);
+	LASSERT(obd != NULL);
+	LASSERT(cluuid != NULL);
+
+	export = class_new_export(obd, cluuid);
+	if (IS_ERR(export))
+		return PTR_ERR(export);
+
+	conn->cookie = export->exp_handle.h_cookie;
+	class_export_put(export);
+
+	CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n",
+	       cluuid->uuid, conn->cookie);
+	return 0;
+}
+EXPORT_SYMBOL(class_connect);
+
+/* If the export is involved in recovery then clean up related things. */
+static void class_export_recovery_cleanup(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	spin_lock(&obd->obd_recovery_task_lock);
+	if (exp->exp_delayed)
+		obd->obd_delayed_clients--;
+	if (obd->obd_recovering) {
+		if (exp->exp_in_recovery) {
+			spin_lock(&exp->exp_lock);
+			exp->exp_in_recovery = 0;
+			spin_unlock(&exp->exp_lock);
+			LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+			atomic_dec(&obd->obd_connected_clients);
+		}
+
+		/* If called during recovery, update the obd_stale_clients
+		 * counter; lightweight exports are not counted. */
+		if (exp->exp_failed &&
+		    (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+			exp->exp_obd->obd_stale_clients++;
+	}
+	spin_unlock(&obd->obd_recovery_task_lock);
+
+	spin_lock(&exp->exp_lock);
+	/** Cleanup req replay fields */
+	if (exp->exp_req_replay_needed) {
+		exp->exp_req_replay_needed = 0;
+
+		LASSERT(atomic_read(&obd->obd_req_replay_clients));
+		atomic_dec(&obd->obd_req_replay_clients);
+	}
+
+	/** Cleanup lock replay data */
+	if (exp->exp_lock_replay_needed) {
+		exp->exp_lock_replay_needed = 0;
+
+		LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+		atomic_dec(&obd->obd_lock_replay_clients);
+	}
+	spin_unlock(&exp->exp_lock);
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in,
+ * and, if the disconnect is really needed,
+ * 2 - for the reference removed from the hash table,
+ * 3 - in class_unlink_export().
+ * The export pointer passed to this function can be destroyed. */
+int class_disconnect(struct obd_export *export)
+{
+	int already_disconnected;
+
+	if (export == NULL) {
+		CWARN("attempting to free NULL export %p\n", export);
+		return -EINVAL;
+	}
+
+	spin_lock(&export->exp_lock);
+	already_disconnected = export->exp_disconnected;
+	export->exp_disconnected = 1;
+	spin_unlock(&export->exp_lock);
+
+	/* class_cleanup(), abort_recovery(), and class_fail_export()
+	 * all end up in here, and if any of them race we shouldn't
+	 * call extra class_export_puts().
*/ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + goto no_disconn; + } + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + return 0; +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + if (exp) { + int connected; + spin_lock(&exp->exp_lock); + connected = exp->exp_conn_cnt > 0; + spin_unlock(&exp->exp_lock); + return connected; + } + return 0; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), last request at " CFS_TIME_T "\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, disconnecting them\n", + obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct cfs_hash *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative request\n", + obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA, + "%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + 
struct cfs_hash *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at administrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_uuid); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s %llu\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks); + spin_unlock(&obd_zombie_impexp_lock); +} +EXPORT_SYMBOL(dump_exports); + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(waited)); + if (waited > 5 && IS_PO2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports more than %d seconds. The obd refcount = %d. 
Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); +} + +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static wait_queue_head_t obd_zombie_waitq; +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + return rc; +} + +/** + * Add export to the obd_zombie thread and notify it. + */ +static void obd_zombie_export_add(struct obd_export *exp) { + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombie thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + LASSERT(imp->imp_rq_pool == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zombie_impexp_thread get this notification. 
+ * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +/** + * destroy zombie export/import thread. + */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. + */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + return 0; +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + struct task_struct *task; + + INIT_LIST_HEAD(&obd_zombie_imports); + INIT_LIST_HEAD(&obd_zombie_exports); + spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + init_waitqueue_head(&obd_zombie_waitq); + obd_zombie_pid = 0; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + return PTR_ERR(task); + + wait_for_completion(&obd_zombie_start); + return 0; +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr *kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Test if payload is part of kuc message + * @param p Pointer to payload area + * @returns boolean + */ +int kuc_ispayload(void *p) +{ + struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1; + + if (kh->kuc_magic == KUC_MAGIC) + return 1; + else + return 0; +} +EXPORT_SYMBOL(kuc_ispayload); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +inline void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} 
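[Editor's note] The kuc_* helpers above implement a "header immediately precedes payload" convention: allocation returns a pointer to the payload, and kuc_ptr() steps back over the header to recover it. Below is a userspace-style sketch of the same pointer arithmetic; the struct layout and MAGIC value are illustrative assumptions, not the real kuc_hdr/KUC_MAGIC.

/* Editor's sketch of the header-before-payload convention used by the kuc_*
 * helpers above; struct layout and MAGIC are illustrative only. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define MAGIC 0x191C			/* stand-in for KUC_MAGIC */

struct hdr {
	uint16_t magic;
	uint16_t transport;
	uint16_t msgtype;
	uint16_t msglen;		/* header + payload */
};

static void *msg_alloc(size_t payload_len, int transport, int type)
{
	struct hdr *h = calloc(1, sizeof(*h) + payload_len);

	if (!h)
		return NULL;
	h->magic = MAGIC;
	h->transport = transport;
	h->msgtype = type;
	h->msglen = sizeof(*h) + payload_len;
	return h + 1;			/* caller sees only the payload */
}

static struct hdr *msg_hdr(void *payload)
{
	struct hdr *h = (struct hdr *)payload - 1;	/* cf. kuc_ptr() */

	assert(h->magic == MAGIC);
	return h;
}

int main(void)
{
	char *payload = msg_alloc(16, 1, 2);

	strcpy(payload, "hello");
	free(msg_hdr(payload));		/* cf. kuc_free() */
	return 0;
}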
+EXPORT_SYMBOL(kuc_free); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c new file mode 100644 index 000000000..06944b863 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,449 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnetctl.h" +#include "../../include/obd_support.h" +#include "../../include/obd_class.h" +#include "../../include/lprocfs_status.h" +#include "../../include/lustre_ver.h" +#include "../../include/lustre/lustre_build_version.h" + +int proc_version; + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int err; + int offset = 0; + + if (copy_from_user(&hdr, (void *)arg, sizeof(hdr))) + return -EFAULT; + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + return -EINVAL; + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + return -EINVAL; + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + return -EINVAL; + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. 
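[Editor's note] obd_ioctl_getdata() below first copies and validates a fixed-size header, then copies the full variable-length buffer and carves the inline sub-buffers out of the bulk area at rounded offsets. The sketch that follows shows only the carving step; round8() stands in for cfs_size_round(), and the 8-byte granularity is an assumption for illustration.

/* Editor's sketch: carving variable-length inline buffers out of one bulk
 * allocation at rounded offsets, as obd_ioctl_getdata() does below. */
#include <stddef.h>

static size_t round8(size_t len)
{
	return (len + 7) & ~(size_t)7;
}

struct req {
	size_t len1, len2, len3;
	char *buf1, *buf2, *buf3;
	char bulk[];			/* inline data follows the header */
};

static void carve(struct req *r)
{
	size_t off = 0;

	if (r->len1) {
		r->buf1 = &r->bulk[0];
		off += round8(r->len1);
	}
	if (r->len2) {
		r->buf2 = &r->bulk[off];
		off += round8(r->len2);
	}
	if (r->len3)
		r->buf3 = &r->bulk[off];
}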
LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + return -EINVAL; + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, (void *)arg, hdr.ioc_len)) { + err = -EFAULT; + goto free_buf; + } + if (hdr.ioc_len != data->ioc_len) { + err = -EINVAL; + goto free_buf; + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + err = -EINVAL; + goto free_buf; + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) { + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + } + + return 0; + +free_buf: + OBD_FREE_LARGE(*buf, hdr.ioc_len); + return err; +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int obd_ioctl_popdata(void *arg, void *data, int len) +{ + int err; + + err = copy_to_user(arg, data, len); + if (err) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(obd_ioctl_popdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode *inode, struct file *file) +{ + try_module_get(THIS_MODULE); + return 0; +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + return err = -EACCES; + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + return err = -ENOTTY; + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + return err; +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = OBD_DEV_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + + +#if defined (CONFIG_PROC_FS) +static int obd_proc_version_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n", + LUSTRE_VERSION_STRING, "patchless_client", BUILD_VERSION); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_version); + +int obd_proc_pinger_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", "on"); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_pinger); + +static int obd_proc_health_seq_show(struct seq_file *m, void *v) +{ + bool healthy = true; + int i; + + if (libcfs_catastrophe) + seq_printf(m, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __func__, current); + 
read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + seq_puts(m, "healthy\n"); + else + seq_puts(m, "NOT HEALTHY\n"); + + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_health); + +static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", obd_jobid_var); + return 0; +} + +static ssize_t obd_proc_jobid_var_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + /* This might leave the var invalid on error, which is probably fine.*/ + if (copy_from_user(obd_jobid_var, buffer, count)) + return -EFAULT; + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_var); + +static int obd_proc_jobid_name_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", obd_jobid_var); + return 0; +} + +static ssize_t obd_proc_jobid_name_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + if (!count || count > JOBSTATS_JOBID_SIZE) + return -EINVAL; + + if (copy_from_user(obd_jobid_node, buffer, count)) + return -EFAULT; + + obd_jobid_node[count] = 0; + + /* Trim the trailing '\n' if any */ + if (obd_jobid_node[count - 1] == '\n') + obd_jobid_node[count - 1] = 0; + + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_name); + +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); + +struct lprocfs_vars lprocfs_base[] = { + { "version", &obd_proc_version_fops }, + { "pinger", &obd_proc_pinger_fops }, + { "health_check", &obd_proc_health_fops }, + { "jobid_var", &obd_proc_jobid_var_fops }, + { .name = "jobid_name", + .fops = &obd_proc_jobid_name_fops}, + { NULL } +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = 
seq_release, +}; + +int class_procfs_init(void) +{ + int rc = 0; + + proc_lustre_root = lprocfs_register("fs/lustre", NULL, + lprocfs_base, NULL); + if (IS_ERR(proc_lustre_root)) { + rc = PTR_ERR(proc_lustre_root); + proc_lustre_root = NULL; + goto out; + } + + rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444, + &obd_device_list_fops, NULL); +out: + if (rc) + CERROR("error adding /proc/fs/lustre/devices file\n"); + return 0; +} + +int class_procfs_clean(void) +{ + if (proc_lustre_root) { + lprocfs_remove(&proc_lustre_root); + } + return 0; +} +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 000000000..62ed706b1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
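[Editor's note] class_procfs_init() above registers /proc/fs/lustre and a "devices" seq_file whose rows follow the format printed by obd_device_list_seq_show() ("index status type name uuid refcount"). A hedged userspace sketch of a reader for that file follows; the path assumes the default proc mount and the field meanings are taken from the seq_printf() above.

/* Editor's sketch: parsing /proc/fs/lustre/devices as produced by
 * obd_device_list_seq_show(). Purely illustrative. */
#include <stdio.h>

int main(void)
{
	char status[8], type[64], name[128], uuid[64];
	int index, refcount;
	FILE *f = fopen("/proc/fs/lustre/devices", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%d %7s %63s %127s %63s %d",
		      &index, status, type, name, uuid, &refcount) == 6)
		printf("device %d (%s) is %s, %d refs\n",
		       index, name, status, refcount);
	fclose(f);
	return 0;
}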
+ * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../../include/obd_class.h" +#include "../../include/lustre/lustre_idl.h" + +#include +#include /* for PAGE_CACHE_SIZE */ + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid) +{ + u32 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, u32 valid) +{ + __u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); + +void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid) +{ + valid &= src->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid %#llx, cur time %lu/%lu, new %llu/%llu\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + /* optimum IO size */ + if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits)) + dst->i_blkbits = ffs(src->o_blksize) - 1; + + if (dst->i_blkbits < PAGE_CACHE_SHIFT) + dst->i_blkbits = 
PAGE_CACHE_SHIFT; + + /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks) + /* + * XXX shouldn't overflow be checked here like in + * obdo_to_inode(). + */ + dst->i_blocks = src->o_blocks; +} +EXPORT_SYMBOL(obdo_refresh_inode); + +void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid) +{ + valid &= src->o_valid; + + LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID | + OBD_MD_FLID | OBD_MD_FLGROUP)), + "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid); + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid %#llx, cur time %lu/%lu, new %llu/%llu\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ + dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } + if (valid & OBD_MD_FLBLKSZ) + dst->i_blkbits = ffs(src->o_blksize)-1; + if (valid & OBD_MD_FLMODE) + dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->i_uid = make_kuid(&init_user_ns, src->o_uid); + if (valid & OBD_MD_FLGID) + dst->i_gid = make_kgid(&init_user_ns, src->o_gid); + if (valid & OBD_MD_FLFLAGS) + dst->i_flags = src->o_flags; +} +EXPORT_SYMBOL(obdo_to_inode); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 000000000..4b62d2576 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,405 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
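[Editor's note] The obdo translation helpers above (obdo_from_la, la_from_obdo, obdo_refresh_inode, obdo_to_inode) all share one shape: a "valid" bitmask gates which fields are copied, and the destination records which fields it actually received. A minimal sketch of that pattern follows; the attr structure and flag values are hypothetical.

/* Editor's sketch of the bitmask-gated attribute copy used by the obdo
 * helpers above; struct attr and the HAS_* flags are hypothetical. */
#include <stdint.h>

#define HAS_SIZE  0x1
#define HAS_MTIME 0x2

struct attr {
	uint64_t size;
	uint64_t mtime;
	uint32_t valid;			/* which fields are meaningful */
};

static void attr_copy(struct attr *dst, const struct attr *src, uint32_t want)
{
	uint32_t valid = want & src->valid;	/* only fields the source has */
	uint32_t newvalid = 0;

	if (valid & HAS_SIZE) {
		dst->size = src->size;
		newvalid |= HAS_SIZE;
	}
	if (valid & HAS_MTIME) {
		dst->mtime = src->mtime;
		newvalid |= HAS_MTIME;
	}
	dst->valid |= newvalid;
}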
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/obd_support.h" +#include "../../include/lprocfs_status.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *obd_table_header; +#endif + + +#define OBD_SYSCTL 300 + +enum { + OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */ + OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ + OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ + OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */ + OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */ + OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */ + OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ + OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */ + OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ + OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ + OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */ + OBD_AT_MIN, /* Adaptive timeouts params */ + OBD_AT_MAX, + OBD_AT_EXTRA, + OBD_AT_EARLY_MARGIN, + OBD_AT_HISTORY, +}; + + +#ifdef CONFIG_SYSCTL +static int proc_set_timeout(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + return rc; +} + +static int proc_memory_alloc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_pages_alloc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_mem_max(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_pages_max(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_max_dirty_pages_in_mb(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int 
rc = 0; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int *)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + /* Don't allow them to let dirty pages exceed 90% of system + * memory and set a hard minimum of 4MB. */ + if (obd_max_dirty_pages > ((totalram_pages / 10) * 9)) { + CERROR("Refusing to set max dirty pages to %u, which is more than 90%% of available RAM; setting to %lu\n", + obd_max_dirty_pages, + ((totalram_pages / 10) * 9)); + obd_max_dirty_pages = (totalram_pages / 10) * 9; + } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) { + obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT); + } + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, sizeof(buf), + *(unsigned int *)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +static int proc_alloc_fail_rate(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int *)table->data, + OBD_ALLOC_FAIL_MULT); + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, 21, + *(unsigned int *)table->data, + OBD_ALLOC_FAIL_MULT); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +static struct ctl_table obd_table[] = { + { + .procname = "timeout", + .data = &obd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + .procname = "debug_peer_on_timeout", + .data = &obd_debug_peer_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "dump_on_timeout", + .data = &obd_dump_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "dump_on_eviction", + .data = &obd_dump_on_eviction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "memused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_memory_alloc + }, + { + .procname = "pagesused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_alloc + }, + { + .procname = "memused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_mem_max + }, + { + .procname = "pagesused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_max + }, + { + .procname = "ldlm_timeout", + .data = &ldlm_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + .procname = "alloc_fail_rate", + .data = &obd_alloc_fail_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_alloc_fail_rate + }, + { + .procname = "max_dirty_mb", + .data = &obd_max_dirty_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_max_dirty_pages_in_mb + }, + { + .procname = "at_min", + .data = &at_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_max", + .data = &at_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + 
.procname = "at_extra", + .data = &at_extra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_early_margin", + .data = &at_early_margin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_history", + .data = &at_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + {} +}; + +static struct ctl_table parent_table[] = { + { + .procname = "lustre", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = obd_table + }, + {} +}; +#endif + +void obd_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + if (!obd_table_header) + obd_table_header = register_sysctl_table(parent_table); +#endif +} + +void obd_sysctl_clean(void) +{ +#ifdef CONFIG_SYSCTL + if (obd_table_header) + unregister_sysctl_table(obd_table_header); + obd_table_header = NULL; +#endif +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog.c new file mode 100644 index 000000000..114be4a78 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog.c @@ -0,0 +1,1007 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + spin_lock_init(&loghandle->lgh_hdr_lock); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
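[Editor's note] The llog handle allocated by llog_alloc_handle() above is reference counted: llog_handle_get()/llog_handle_put() below adjust lgh_refcount, and the handle is freed only when the last reference is dropped. A generic sketch of that get/put idiom follows; struct foo and its fields are hypothetical.

/* Editor's sketch of the get/put lifetime used for llog handles below. */
#include <linux/atomic.h>
#include <linux/slab.h>

struct foo {
	atomic_t refcount;
};

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_NOFS);

	if (f)
		atomic_set(&f->refcount, 1);	/* caller owns one reference */
	return f;
}

static void foo_get(struct foo *f)
{
	atomic_inc(&f->refcount);
}

static void foo_put(struct foo *f)
{
	if (atomic_dec_and_test(&f->refcount))
		kfree(f);			/* last reference frees it */
}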
Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (!loghandle->lgh_hdr) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int rc = 0; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n", + index, POSTID(&loghandle->lgh_id.lgl_oi)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + return -EINVAL; + } + + spin_lock(&loghandle->lgh_hdr_lock); + if (!ext2_clear_bit(index, llh->llh_bitmap)) { + spin_unlock(&loghandle->lgh_hdr_lock); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + return -ENOENT; + } + + llh->llh_count--; + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { + spin_unlock(&loghandle->lgh_hdr_lock); + rc = llog_destroy(env, loghandle); + if (rc < 0) { + CERROR("%s: can't destroy empty llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + goto out_err; + } + return 1; + } + spin_unlock(&loghandle->lgh_hdr_lock); + + rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); + if (rc < 0) { + CERROR("%s: fail to write header for llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + goto out_err; + } + return 0; +out_err: + spin_lock(&loghandle->lgh_hdr_lock); + ext2_set_bit(index, llh->llh_bitmap); + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + return rc; +} +EXPORT_SYMBOL(llog_cancel_rec); + +static int llog_read_header(const struct lu_env *env, + struct llog_handle *handle, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + + if (lop->lop_read_header == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE; + llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0; + llh->llh_timestamp = get_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + ext2_set_bit(0, llh->llh_bitmap); + rc = 0; + } + return rc; +} + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct 
obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + int rc; + + LASSERT(handle->lgh_hdr == NULL); + + OBD_ALLOC_PTR(llh); + if (llh == NULL) + return -ENOMEM; + handle->lgh_hdr = llh; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + rc = -EINVAL; + goto out; + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + rc = -EINVAL; + goto out; + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + rc = -EEXIST; + goto out; + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } +out: + if (rc) { + OBD_FREE_PTR(llh); + handle->lgh_hdr = NULL; + } + return rc; +} +EXPORT_SYMBOL(llog_init_handle); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + + LASSERT(llh); + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) { + lpi->lpi_rc = -ENOMEM; + return 0; + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, llh->llh_bitmap)) + ++index; + + LASSERT(index <= last_index + 1); + if (index == last_index + 1) + break; +repeat: + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + last_offset = cur_offset; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, LLOG_CHUNK_SIZE); + if (rc) + goto out; + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. 
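[Editor's note] llog_process_thread() below walks the log one LLOG_CHUNK_SIZE buffer at a time, skipping indices whose bit is clear in the header bitmap and handing every remaining record to a callback. A compact userspace sketch of that bitmap-gated walk over length-prefixed records follows; the record layout is simplified and the callback type is hypothetical.

/* Editor's sketch of the bitmap-gated record walk in llog_process_thread(). */
#include <stdint.h>
#include <stddef.h>

struct rec {
	uint32_t len;			/* total record length, header included */
	uint32_t index;			/* slot in the header bitmap */
};

static int bit_set(const uint8_t *bitmap, uint32_t idx)
{
	return bitmap[idx / 8] & (1 << (idx % 8));
}

typedef int (*rec_cb)(const struct rec *r, void *arg);

static int walk(const char *buf, size_t buflen, const uint8_t *bitmap,
		rec_cb cb, void *arg)
{
	const char *p = buf;

	while (p + sizeof(struct rec) <= buf + buflen) {
		const struct rec *r = (const struct rec *)p;
		int rc;

		if (r->len == 0 || p + r->len > buf + buflen)
			break;			/* corrupt or end of chunk */
		if (bit_set(bitmap, r->index)) {	/* cancelled records skipped */
			rc = cb(r, arg);
			if (rc)
				return rc;
		}
		p += r->len;			/* records are length-prefixed */
	}
	return 0;
}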
*/ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){ + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (rec->lrh_index == 0) { + /* probably another rec just got added? */ + rc = 0; + if (index <= loghandle->lgh_last_idx) + goto repeat; + goto out; /* no more records */ + } + if (rec->lrh_len == 0 || + rec->lrh_len > LLOG_CHUNK_SIZE) { + CWARN("invalid length %d in llog record for index %d/%d\n", + rec->lrh_len, + rec->lrh_index, index); + rc = -EINVAL; + goto out; + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + last_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + goto out; + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + rc = 0; + } + if (rc) + goto out; + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); + } + + /* next record, still in buffer? */ + ++index; + if (index > last_index) { + rc = 0; + goto out; + } + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + return -ENOMEM; + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. 
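[Editor's note] When forking is requested, llog_process_or_fork() hands the processing to a kthread and blocks on a completion so the work runs with a freshly initialized lu_env instead of the caller's. The general kernel pattern is sketched below with hypothetical names (struct job, job_thread, run_job).

/* Editor's sketch: run a worker inline or in a kthread and wait on a
 * completion, as llog_process_or_fork() does. All names are hypothetical. */
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/types.h>

struct job {
	struct completion done;
	int rc;
};

static int do_work(struct job *j)
{
	j->rc = 0;			/* the real work would happen here */
	return j->rc;
}

static int job_thread(void *arg)
{
	struct job *j = arg;

	do_work(j);
	complete(&j->done);		/* wake the waiter in run_job() */
	return 0;
}

static int run_job(struct job *j, bool fork)
{
	if (fork) {
		struct task_struct *t;

		init_completion(&j->done);
		t = kthread_run(job_thread, j, "job_thread");
		if (IS_ERR(t))
			return PTR_ERR(t);
		wait_for_completion(&j->done);
	} else {
		do_work(j);		/* caller's context is good enough */
	}
	return j->rc;
}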
*/ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + OBD_FREE_PTR(lpi); + return rc; + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); + return rc; +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + return llog_process_or_fork(env, loghandle, cb, data, catdata, true); +} +EXPORT_SYMBOL(llog_process); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + return -ENOMEM; + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(env, loghandle, index, buf, + LLOG_CHUNK_SIZE); + if (rc) + goto out; + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) { + /* no more records */ + rc = 0; + goto out; + } + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + goto out; + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(env, loghandle, + tail->lrt_index); + rc = 0; + } + if (rc) + goto out; + } + + /* previous record, still in buffer? 
*/ + --index; + if (index < first_index) { + rc = 0; + goto out; + } + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + return rc; +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_exist == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_exist(loghandle); + return rc; +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_declare_create == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_create); + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + if (lop->lop_create == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_create); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_write_rec); + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + + LASSERT(lop); + if (lop->lop_write_rec == NULL) + return -EOPNOTSUPP; + + if (buf) + buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + else + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies, + buf, idx, th); + if (!raised) + 
cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_write_rec); + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th) +{ + int raised, rc; + + if (lgh->lgh_logops->lop_add == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + if (lgh->lgh_logops->lop_declare_add == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct dt_device *d; + struct thandle *th; + int rc; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + return rc; + + if (llog_exist(*res)) + return 0; + + LASSERT((*res)->lgh_obj != NULL); + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) { + rc = PTR_ERR(th); + goto out; + } + + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); +out: + if (rc) + llog_close(env, *res); + return rc; +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + return 0; + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + return rc; + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + return rc; +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
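[Editor's note] For orientation, this is roughly how a local caller would drive the helpers in this file: open or create a named log, initialize it as a plain log, append a record, and close it. The signatures match the definitions in this patch (declared in lustre_log.h); env, ctxt and the record are assumed to be set up elsewhere, and error handling is pared down.

/* Editor's usage sketch for the llog helpers in this file. */
static int example_append(const struct lu_env *env, struct llog_ctxt *ctxt,
			  struct llog_rec_hdr *rec)
{
	struct llog_handle *lgh;
	int rc;

	rc = llog_open_create(env, ctxt, &lgh, NULL, "example_log");
	if (rc)
		return rc;

	rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, NULL);
	if (rc)
		goto out_close;

	/* idx == -1 appends; llog_write() wraps the transaction for us */
	rc = llog_write(env, lgh, rec, NULL, 0, NULL, -1);

out_close:
	llog_close(env, lgh);
	return rc;
}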
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx) +{ + struct dt_device *dt; + struct thandle *th; + int rc; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + return PTR_ERR(th); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + goto out_trans; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto out_trans; + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + return rc; +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + return -EOPNOTSUPP; + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + return -ENOMEM; + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + return rc; +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + goto out; + if (lop->lop_close == NULL) { + rc = -EOPNOTSUPP; + goto out; + } + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + return rc; +} +EXPORT_SYMBOL(llog_close); + +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + goto out; + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_close; + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* header is record 1 */ + return rc <= 1; +} +EXPORT_SYMBOL(llog_is_empty); + +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + + /* Append all records */ + return llog_write(env, copy_llh, rec, NULL, 0, NULL, -1); +} +EXPORT_SYMBOL(llog_copy_handler); + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + int rc; + + + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. 
+ */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + return rc; + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_close; + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + goto out_close; + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + goto out_close; + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + rc = -EEXIST; + goto out_backup; + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_backup; + + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + NULL, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + return rc; +} +EXPORT_SYMBOL(llog_backup); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c new file mode 100644 index 000000000..c8f6ab006 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -0,0 +1,815 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
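[Editor's note] llog_cat_new_log() below hands out catalog slots with a simple wrap-around counter and treats the catalog as full when the producer index catches up with llh_cat_idx, which is why only bitmap_size - 2 slots are usable. A toy sketch of that ring-style slot allocator follows; struct ring is hypothetical and mirrors the function's own comments.

/* Editor's sketch of the wrap-around slot allocation in llog_cat_new_log(). */
#include <stdint.h>

struct ring {
	uint32_t last_idx;	/* cf. lgh_last_idx, last slot handed out */
	uint32_t cat_idx;	/* cf. llh_cat_idx, first still-used slot */
	uint32_t size;		/* cf. bitmap_size */
};

/* Returns the new slot index, or -1 when the catalog is full. */
static int ring_alloc(struct ring *r)
{
	uint32_t index = (r->last_idx + 1) % r->size;

	if (index == r->cat_idx)
		return -1;		/* no free catalog slots */
	if (index == 0)			/* slot 0 is the header, skip it */
		index = 1;
	r->last_idx = index;
	return (int)index;
}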
+ */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + + struct llog_log_hdr *llh; + struct llog_logid_rec rec = { { 0 }, }; + int rc, index, bitmap_size; + + llh = cathandle->lgh_hdr; + bitmap_size = LLOG_BITMAP_SIZE(llh); + + index = (cathandle->lgh_last_idx + 1) % bitmap_size; + + /* maximum number of available slots in catlog is bitmap_size - 2 */ + if (llh->llh_cat_idx == index) { + CERROR("no free catalog slots for log...\n"); + return -ENOSPC; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + return -ENOSPC; + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + return 0; + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + return rc; + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc) + goto out_destroy; + + if (index == 0) + index = 1; + + spin_lock(&loghandle->lgh_hdr_lock); + llh->llh_count++; + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", + index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + spin_unlock(&loghandle->lgh_hdr_lock); + + cathandle->lgh_last_idx = index; + llh->llh_tail.lrt_index = index; + + CDEBUG(D_RPCTRACE, + "new recovery log "DOSTID":%x for index %u of catalog" + DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, index, + POSTID(&cathandle->lgh_id.lgl_oi)); + /* build the record for this log in the catalog */ + rec.lid_hdr.lrh_len = sizeof(rec); + rec.lid_hdr.lrh_index = index; + rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec.lid_id = loghandle->lgh_id; + rec.lid_tail.lrt_len = sizeof(rec); + rec.lid_tail.lrt_index = index; + + /* update the catalog: header and record */ + rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + &loghandle->u.phd.phd_cookie, 1, NULL, index, th); + if (rc < 0) + goto out_destroy; + + loghandle->lgh_hdr->llh_cat_idx = index; + return 0; +out_destroy: + llog_destroy(env, loghandle); + return rc; +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
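+ * On a hit in the catalog's chd_head cache the existing handle is reused;
+ * otherwise the plain llog is opened, initialized and linked onto chd_head
+ * before being returned.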
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + int rc = 0; + + if (cathandle == NULL) + return -EBADF; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CERROR("%s: log "DOSTID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), cgl->lgl_ogen, + logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + rc = 0; + goto out; + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + return rc; + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + return rc; + } + + down_write(&cathandle->lgh_lock); + list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + return rc; +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
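+ *
+ * The implementation below makes two passes: the fast path takes the
+ * catalog lock shared and write-locks the current plain log, returning it
+ * if it still has free slots; otherwise the catalog lock is retaken
+ * exclusive, the state is rechecked, and chd_next_log is promoted to
+ * chd_current_log.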
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || + loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_read(&cathandle->lgh_lock); + return loghandle; + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_write(&cathandle->lgh_lock); + return loghandle; + } else { + up_write(&loghandle->lgh_lock); + } + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + return loghandle; +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th) +{ + struct llog_handle *loghandle; + int rc; + + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + return rc; + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th); + if (rc < 0) + CDEBUG_LIMIT(rc == -ENOSPC ? 
D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + if (rc == -ENOSPC) { + /* try to use next log */ + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + /* new llog can be created concurrently */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + return rc; + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, + -1, th); + if (rc < 0) + CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_handle *loghandle, *next; + int rc = 0; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } else if (cathandle->u.chd.chd_next_log == NULL) { + /* declare next plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_next_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_next_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } + if (rc) + goto out; + + if (!llog_exist(cathandle->u.chd.chd_current_log)) { + rc = llog_declare_create(env, cathandle->u.chd.chd_current_log, + th); + if (rc) + goto out; + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + /* declare records in the llogs */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc) + goto out; + + next = cathandle->u.chd.chd_next_log; + if (next) { + if (!llog_exist(next)) { + rc = llog_declare_create(env, next, th); + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + llog_declare_write_rec(env, next, rec, -1, th); + } +out: + return rc; +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + if (cathandle->lgh_obj != NULL) { + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + return PTR_ERR(th); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + goto out_trans; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto out_trans; + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compat code */ + LASSERT(cathandle->lgh_file != NULL); + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc == 0) + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, + buf, th); + } + return rc; +} +EXPORT_SYMBOL(llog_cat_add); + +/* For each cookie in the cookie array, we clear the log in-use 
bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, index, rc = 0, failed = 0; + + for (i = 0; i < count; i++, cookies++) { + struct llog_handle *loghandle; + struct llog_logid *lgl = &cookies->lgc_lgl; + int lrc; + + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lgl->lgl_oi), rc); + failed++; + continue; + } + + lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index); + if (lrc == 1) { /* log has been destroyed */ + index = loghandle->u.phd.phd_cookie.lgc_index; + rc = llog_cat_cleanup(env, cathandle, loghandle, + index); + } else if (lrc == -ENOENT) { + if (rc == 0) /* ENOENT shouldn't rewrite any error */ + rc = lrc; + } else if (lrc < 0) { + failed++; + rc = lrc; + } + llog_handle_put(loghandle); + } + if (rc) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, + rc); + + return rc; +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + return rc; + } + + if (rec->lrh_index < d->lpd_startcat) + /* Skip processing of the logs until startcat */ + rc = 0; + else if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + + llog_handle_put(llh); + + return rc; +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + struct llog_process_cat_data cd; + + CWARN("catlog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + if (rc != 0) + return rc; + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx 
= cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, NULL, fork); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_process_or_fork); + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat, + startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + return rc; + } + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + llog_handle_put(llh); + return rc; +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + CWARN("catalog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + return rc; + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int index) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int i, bitmap_size, idx; + + bitmap_size = LLOG_BITMAP_SIZE(llh); + if (llh->llh_cat_idx == (index - 1)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + if (idx == cathandle->lgh_last_idx) + goto out; + for (i = (index + 1) % bitmap_size; + i != cathandle->lgh_last_idx; + i = (i + 1) % bitmap_size) { + if (!ext2_test_bit(i, llh->llh_bitmap)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + } else if (i == 0) { + llh->llh_cat_idx = 0; + } else { + break; + } + } +out: + CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n", + POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx); + } + + return 0; +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + down_write(&cathandle->lgh_lock); + 
if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, "cancel plain log at index %u of catalog " DOSTID "\n", + index, POSTID(&cathandle->lgh_id.lgl_oi)); + return rc; +} + +static int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + struct llog_log_hdr *llh; + int rc; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* remove index from catalog */ + llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index); + } + return rc; + } + + llh = loghandle->lgh_hdr; + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + + llog_cat_cleanup(env, cathandle, loghandle, + loghandle->u.phd.phd_cookie.lgc_index); + } + llog_handle_put(loghandle); + + return rc; +} + +/* helper to initialize catalog llog and process it to cancel */ +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh) +{ + int rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL); + if (rc) + return rc; + + rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false); + if (rc) + CERROR("%s: llog_process() with cat_cancel_cb failed: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + return 0; +} +EXPORT_SYMBOL(llog_cat_init_and_process); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h new file mode 100644 index 000000000..5332131a2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include "../include/lustre_log.h" + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_rec_hdr lgi_lrh; + struct llog_rec_tail lgi_tail; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +static inline void +lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen) +{ + ostid_set_seq_llog(&logid->lgl_oi); + ostid_set_id(&logid->lgl_oi, ino); + logid->lgl_ogen = gen; +} + +int llog_info_init(void); +void llog_info_fini(void); + +void llog_handle_get(struct llog_handle *loghandle); +void llog_handle_put(struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c new file mode 100644 index 000000000..978d886a1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. 
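+ * This drops what should be the last (setup-time) reference; once the
+ * count reaches zero, __llog_ctxt_put() also runs the backend's
+ * lop_cleanup() method, frees the context and wakes olg_waitq.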
+ */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + return rc; +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + return -EINVAL; + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + return -ENOMEM; + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + /* + * mds_lov_update_desc() might call here multiple + * times. So if the llog is already set up then + * don't to do it again. + */ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + return rc; + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + return rc; +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + + if (!ctxt) + return 0; + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + return rc; +} +EXPORT_SYMBOL(llog_sync); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags) +{ + int rc; + + if (!ctxt) { + CERROR("No ctxt\n"); + return -ENODEV; + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags); + return rc; +} +EXPORT_SYMBOL(llog_cancel); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); +EXPORT_SYMBOL(llog_thread_key); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c new file mode 100644 index 000000000..a2d5aa105 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -0,0 +1,415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). + * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/lustre_log.h" + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n", + POSTID(&d->lgd_logid.lgl_oi)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} +EXPORT_SYMBOL(lustre_swab_llog_id); + +void lustre_swab_llogd_body(struct llogd_body *d) +{ + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body(struct llogd_conn_body *d) +{ + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s(&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + 
__swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (CHANGELOG_REC_EXTENDED(&cr->cr)) { + struct llog_changelog_ext_rec *ext = + (struct llog_changelog_ext_rec *)rec; + + lustre_swab_lu_fid(&ext->cr.cr_sfid); + lustre_swab_lu_fid(&ext->cr.cr_spfid); + tail = &ext->cr_tail; + } else { + tail = &cr->cr_tail; + } + tail = (struct llog_rec_tail *)((char *)tail + + cr->cr.cr_namelen); + break; + } + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec *)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* no swabing for opaque data */ + /* hai_data[0]; */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + tail = &lsr->lsr_tail; + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = &llh->llh_tail; + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + default: + 
CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr(struct llog_log_hdr *h) +{ + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. 
We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + return; +} +EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c new file mode 100644 index 000000000..c49dfe541 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ + +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_support.h" + +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); + +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. 
*/ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c new file mode 100644 index 000000000..c171c6c6c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2059 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_idl.h" +#include +#include + +static const char * const obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file(obsolete)", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "unknown", + NULL +}; + +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i, ret = 0; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown flags %#llx", + ret ? sep : "", flags & ~(mask - 1)); + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1) { + /* only reserved 2 bits fraction */ + buffer[prtn++] = '0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. 
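+ * For example, with mult = 1000: val = 1234 reads back as "1.23",
+ * val = 50 as "0.05" and val = 2000 as plain "2".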
+ */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while (buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] = '\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_write_frac_helper(const char __user *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_helper); + +#if defined (CONFIG_PROC_FS) + +static int lprocfs_no_percpu_stats; +module_param(lprocfs_no_percpu_stats, int, 0644); +MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +/* lprocfs API calls */ + +struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, void *data, + struct file_operations *fops) +{ + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); + + if (fops->read) + mode = 0444; + if (fops->write) + mode |= 0200; + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s", name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) +{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from %s to %s", + name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static struct file_operations lprocfs_generic_fops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. 
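+ * Entries whose lprocfs_vars::data is non-NULL use that value
+ * instead, and each entry's mode comes from proc_mode when set,
+ * otherwise it is derived from the fops read/write methods.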
+ * + * \retval 0 on success + * < 0 on error + */ +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (root == NULL || list == NULL) + return -EINVAL; + + while (list->name != NULL) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_generic_fops, + list->data ?: data); + if (proc == NULL) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir(name, parent); + if (entry == NULL) { + entry = ERR_PTR(-ENOMEM); + goto out; + } + + if (list != NULL) { + int rc = lprocfs_add_vars(entry, list, data); + if (rc != 0) { + lprocfs_remove(&entry); + entry = ERR_PTR(rc); + } + } +out: + return entry; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_rd_uint(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", *(unsigned int *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_uint); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +int lprocfs_rd_u64(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_u64); + +int lprocfs_rd_atomic(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + seq_printf(m, "%d\n", atomic_read(atom)); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_atomic); + +int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + atomic_t *atm = data; + int val = 0; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_atomic); + +int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_uuid); + +int lprocfs_rd_name(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + seq_printf(m, "%s\n", dev->obd_name); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_name); + +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + + return rc; +} 
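+
+/*
+ * Illustrative sketch: one plausible way to wire a read helper such as
+ * lprocfs_rd_blksize() above into /proc through lprocfs_register() /
+ * lprocfs_add_vars() and the seq_file interface.  The example_* names are
+ * hypothetical, and the block is kept under #if 0 so it only documents the
+ * call pattern; it relies solely on helpers defined in this file plus the
+ * standard seq_file/procfs API.
+ */
+#if 0
+static int example_blksize_seq_show(struct seq_file *m, void *unused)
+{
+	/* m->private carries the data pointer given to proc_create_data() */
+	return lprocfs_rd_blksize(m, m->private);
+}
+
+static int example_blksize_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, example_blksize_seq_show, PDE_DATA(inode));
+}
+
+static struct file_operations example_blksize_fops = {
+	.owner   = THIS_MODULE,
+	.open    = example_blksize_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = lprocfs_single_release,
+};
+
+static struct lprocfs_vars example_obd_vars[] = {
+	{ .name = "blocksize", .fops = &example_blksize_fops },
+	{ .name = NULL }
+};
+
+/* hypothetical caller, e.g. an obd type's setup path */
+static int example_register_procfs(struct obd_device *obd)
+{
+	struct proc_dir_entry *dir;
+
+	/* a real caller would pass its parent /proc directory, not NULL */
+	dir = lprocfs_register(obd->obd_name, NULL, example_obd_vars, obd);
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+}
+#endif
+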
+EXPORT_SYMBOL(lprocfs_rd_blksize); + +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); + +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); + +int lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesavail); + +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filestotal); + +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filesfree); + +int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + seq_printf(m, "%s\t%s%s\n", + obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? 
"\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_server_uuid); + +int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + seq_puts(m, "\n"); + + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_stats_collect); + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag, first) \ + do { \ + if (imp->imp_##flag) \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + } while (0) +static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(invalid, first); + first = false; + flag2str(deactive, first); + flag2str(replayable, first); + flag2str(pingable, first); + return 0; +} +#undef flags2str + +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i; + bool first = true; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? sep : "", obd_connect_names[i]); + first = false; + } + } + if (flags & ~(mask - 1)) + seq_printf(m, "%sunknown flags %#llx", + first ? sep : "", flags & ~(mask - 1)); +} + +int lprocfs_rd_import(struct seq_file *m, void *data) +{ + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, + "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " instance: %u\n" + " connect_flags: [", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state), + imp->imp_connect_data.ocd_instance); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", "); + seq_printf(m, + "]\n" + " import_flags: ["); + obd_import_flags2str(imp, m); + + seq_printf(m, + "]\n" + " connection:\n" + " failover_nids: ["); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + seq_printf(m, "%s%s", j ? 
", " : "", + libcfs_nid2str(conn->oic_conn->c_peer.nid)); + j++; + } + seq_printf(m, + "]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + imp->imp_connection == NULL ? "" : + libcfs_nid2str(imp->imp_connection->c_peer.nid), + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, + " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for (j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, + " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, + " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, + " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_import); + +int lprocfs_rd_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - ["CFS_TIME_T", %s]\n", + ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_state); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + now = get_seconds(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_timeouts); + +int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + seq_printf(m, "flags=%#llx\n", flags); + obd_connect_seq_flags2str(m, flags, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; 
+} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + +int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%u\n", obd->obd_num_exports); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_num_exports); + +int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ + struct obd_type *class = (struct obd_type *) data; + + LASSERT(class != NULL); + seq_printf(m, "%d\n", class->typ_refcnt); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + list, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", + rc, obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; + +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid] != NULL) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, flags); + else + spin_unlock(&stats->ls_lock); + } + /* 
initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} +EXPORT_SYMBOL(lprocfs_stats_alloc_one); + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct 
seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) { + struct timeval now; + do_gettimeofday(&now); + seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", + now.tv_sec, (unsigned long)now.tv_usec); + } + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count != 0) { + seq_printf(p, "%-25s %lld samples [%s]", + hdr->lc_name, ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && + (ctr.lc_count > 0)) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %lld", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + } + + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + 
lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) + + num_private_stats - 1 /* o_owner */; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = 
num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * , and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op operation at offset %d.\n", + i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +#define LPROCFS_MD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} +EXPORT_SYMBOL(lprocfs_init_mps_stats); + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->md_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->md_cntr_base == 0); + + num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) + + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + 
lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->md_stats = stats; + obd->md_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->md_stats; + + if (stats != NULL) { + obd->md_stats = NULL; + obd->md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +int lprocfs_exp_print_uuid(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = (struct seq_file *)data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + + return 0; +} + +static int +lproc_exp_uuid_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid, m); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_uuid); + +struct exp_hash_cb_data { + struct seq_file *m; + bool first; +}; + +int lprocfs_exp_print_hash(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + if (data->first) { + cfs_hash_debug_header(data->m); + data->first = false; + } + cfs_hash_debug_str(hs, data->m); + } + + return 0; +} + +static int +lproc_exp_hash_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + struct exp_hash_cb_data cb_data = { + .m = m, + .first = true + }; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash, &cb_data); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_hash); + +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ + seq_printf(m, "%s\n", + "Write into this file to clear all nid stats and stale nid entries"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. 
*/ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + return 1; + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + return 0; +} + +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char *buffer = NULL; + int rc = 0; + + *newnid = 0; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + return -EINVAL; + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (!nid || *nid == LNET_NID_ANY) + return 0; + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + return -ENOMEM; + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, libcfs_nid2str(*nid), + atomic_read(&new_stat->nid_exp_ref_count)); + + /* We need to release old stats because lprocfs_exp_cleanup() hasn't + * been and will never be called. 
*/ + if (exp->exp_nid_stats) { + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + } + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + exp->exp_nid_stats = old_stat; + rc = -EALREADY; + goto destroy_new; + } + /* not found - create */ + OBD_ALLOC(buffer, LNET_NIDSTR_SIZE); + if (buffer == NULL) { + rc = -ENOMEM; + goto destroy_new; + } + + memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE); + new_stat->nid_proc = lprocfs_register(buffer, + obd->obd_proc_exports_entry, + NULL, NULL); + OBD_FREE(buffer, LNET_NIDSTR_SIZE); + + if (IS_ERR(new_stat->nid_proc)) { + CERROR("Error making export directory for nid %s\n", + libcfs_nid2str(*nid)); + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + goto destroy_new_ns; + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", + new_stat, &lproc_exp_uuid_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the NID stats file\n"); + rc = PTR_ERR(entry); + goto destroy_new_ns; + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", + new_stat, &lproc_exp_hash_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + goto destroy_new_ns; + } + + exp->exp_nid_stats = new_stat; + *newnid = 1; + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + return rc; + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + return rc; +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + return 0; + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} +EXPORT_SYMBOL(lprocfs_exp_cleanup); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (lc == NULL || header == NULL) + return 0; + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + } + + return 0; +} +EXPORT_SYMBOL(lprocfs_read_helper); + +int lprocfs_write_helper(const char __user *buffer, unsigned long count, + int *val) +{ + return lprocfs_write_frac_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, 
".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + +int lprocfs_write_u64_helper(const char __user *buffer, unsigned long count, + __u64 *val) +{ + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_u64_helper); + +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + __u64 whole, frac = 0, units; + unsigned frac_d = 1; + int sign = 1; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + sign = -1; + pbuf++; + } + + whole = simple_strtoull(pbuf, &end, 10); + if (pbuf == end) + return -EINVAL; + + if (*end == '.') { + int i; + pbuf = end + 1; + + /* need to limit frac_d to a __u32 */ + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + frac = simple_strtoull(pbuf, &end, 10); + /* count decimal places */ + for (i = 0; i < (end - pbuf); i++) + frac_d *= 10; + } + + units = 1; + switch (tolower(*end)) { + case 'p': + units <<= 10; + case 't': + units <<= 10; + case 'g': + units <<= 10; + case 'm': + units <<= 10; + case 'k': + units <<= 10; + } + /* Specified units override the multiplier */ + if (units > 1) + mult = units; + + frac *= mult; + do_div(frac, frac_d); + *val = sign * (whole * mult + frac); + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); + +static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. + */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + + /* Disallow secretly (un)writable entries. 
*/ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val; + + for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++) + ; + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc); + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c new file mode 100644 index 000000000..20c077995 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -0,0 +1,2192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. 
+ * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +# include + +/* hash_long() */ +#include "../../include/linux/libcfs/libcfs_hash.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_fid.h" +#include "../include/lu_object.h" +#include "../include/lu_ref.h" +#include + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + struct cfs_hash_bd bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + LASSERT(bkt->lsb_busy > 0); + bkt->lsb_busy--; + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top)) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached), removed it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. 
This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash; + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + list_del_init(&top->loh_lru); + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + /* + * Create top-level object slice. This will also create + * lu_object_header. + */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + return ERR_PTR(-ENOMEM); + if (IS_ERR(top)) + return top; + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + return ERR_PTR(result); + } + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + return ERR_PTR(result); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + return top; +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. 
+ */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + */ +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + struct cfs_hash_bd bd2; + struct list_head dispose; + int did_sth; + int start; + int count; + int bnr; + int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + return 0; + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + + return nr; +} +EXPORT_SYMBOL(lu_site_purge); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. 
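+ * Holds the partial output line built up by lu_cdebug_printer() until a
+ * terminating newline arrives; the completed line is then handed to
+ * libcfs_debug_msg() and the buffer is reset.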
+ */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist":""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. 
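+ * Returns 1 when every layer that implements ->loo_object_invariant()
+ * reports the object as consistent, and 0 as soon as any layer disagrees.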
+ */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} +EXPORT_SYMBOL(lu_object_invariant); + +static struct lu_object *htable_lookup(struct lu_site *s, + struct cfs_hash_bd *bd, + const struct lu_fid *f, + wait_queue_t *waiter, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return ERR_PTR(-ENOENT); + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (hnode == NULL) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + if (likely(!lu_object_is_dying(h))) { + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + list_del_init(&h->loh_lru); + return lu_object_top(h); + } + + /* + * Lookup found an object being destroyed this object cannot be + * returned (to assure that references to dying objects are eventually + * drained), and moreover, lookup has to wait until object is freed. + */ + + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + return ERR_PTR(-EAGAIN); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. + */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; +} + +/** + * Core logic of lu_object_find*() functions. 
+ */ +static struct lu_object *lu_object_find_try(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf, + wait_queue_t *waiter) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). + */ + if (conf != NULL && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, waiter, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. + */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, waiter, &version); + if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { + struct lu_site_bkt_data *bkt; + + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} + +/** + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_site_bkt_data *bkt; + struct lu_object *obj; + wait_queue_t wait; + + while (1) { + obj = lu_object_find_try(env, dev, f, conf, &wait); + if (obj != ERR_PTR(-EAGAIN)) + return obj; + /* + * lu_object_find_try() already added waiter into the + * wait queue. + */ + schedule(); + bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f); + remove_wait_queue(&bkt->lsb_marche_funebre, &wait); + } +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. 
+ */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (!IS_ERR(top)) { + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (obj == NULL) + lu_object_put(env, top); + } else + obj = top; + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +/** + * Global list of all device types. + */ +static LIST_HEAD(lu_device_types); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + INIT_LIST_HEAD(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + if (result == 0) + list_add(&ldt->ldt_linkage, &lu_device_types); + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + list_del_init(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +void lu_types_stop(void) +{ + struct lu_device_type *ldt; + + list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { + if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) + ldt->ldt_ops->ldto_stop(ldt); + } +} +EXPORT_SYMBOL(lu_types_stop); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DEFINE_MUTEX(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +/** + * Return desired hash table order. + */ +static int lu_htable_order(void) +{ + unsigned long cache_size; + int bits; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = totalram_pages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT)) + cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. 
*/ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_CACHE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + return bits; +} + +static unsigned lu_obj_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (atomic_add_return(1, &h->loh_ref) == 1) { + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + + cfs_hash_bd_get(hs, &h->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + bkt->lsb_busy++; + } +} + +static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +cfs_hash_ops_t lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. 
+ */ +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + char name[16]; + int bits; + int i; + + memset(s, 0, sizeof(*s)); + bits = lu_htable_order(); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + return 0; +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + mutex_lock(&lu_sites_guard); + list_del_init(&s->ls_linkage); + mutex_unlock(&lu_sites_guard); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + mutex_lock(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + mutex_unlock(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. 
+ */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof(*d)); + atomic_set(&d->ld_ref, 0); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t; + + t = d->ld_type; + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(t->ldt_device_nr > 0); + if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof(*h)); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + + + +/** + * Finalize and free devices in the device stack. 
+ * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} +EXPORT_SYMBOL(lu_stack_fini); + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +static DEFINE_SPINLOCK(lu_keys_guard); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + + result = -ENFILE; + spin_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + spin_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + if ((ctx->lc_tags & LCT_NOREF) == 0) { +#ifdef CONFIG_MODULE_UNLOAD + LINVRNT(module_refcount(key->lct_owner) > 0); +#endif + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + ++key_set_version; + spin_lock(&lu_keys_guard); + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + spin_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) 
+{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + key->lct_tags |= LCT_QUIESCENT; + /* + * XXX memory barrier has to go here. + */ + spin_lock(&lu_keys_guard); + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + spin_unlock(&lu_keys_guard); + ++key_set_version; + } +} +EXPORT_SYMBOL(lu_context_key_quiesce); + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; +} +EXPORT_SYMBOL(lu_context_key_revive); + +static void keys_fini(struct lu_context *ctx) +{ + int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0])); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. 
+ */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) + return PTR_ERR(value); + + if (!(ctx->lc_tags & LCT_NOREF)) + try_module_get(key->lct_owner); + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + ctx->lc_version = key_set_version; + } + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, + ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0])); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof(*ctx)); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + spin_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. + */ +int lu_context_refill(struct lu_context *ctx) +{ + return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); +} +EXPORT_SYMBOL(lu_context_refill); + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. 
+ */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + return lu_env_refill(env); +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(struct cfs_hash *hs, + lu_site_stats_t *stats, int populated) +{ + struct cfs_hash_bd bd; + int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += bkt->lsb_busy; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. 
Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + lu_site_stats_t stats; + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 0); + cached += stats.lss_total - stats.lss_busy; + } + mutex_unlock(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached\n", cached); + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan, freed = 0; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return SHRINK_STOP; + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + freed = lu_site_purge(&lu_shrink_env, s, remain); + remain -= freed; + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + mutex_unlock(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +static struct shrinker lu_site_shrinker = { + .count_objects = lu_cache_shrink_count, + .scan_objects = lu_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. 
+ */ + mutex_lock(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + mutex_unlock(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + register_shrinker(&lu_site_shrinker); + + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + unregister_shrinker(&lu_site_shrinker); + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + mutex_lock(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + mutex_unlock(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#if defined (CONFIG_PROC_FS) + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). 
+ */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct lu_site_bkt_data *bkt; + struct lu_object *shadow; + wait_queue_t waiter; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + __u64 version = 0; + + LASSERT(fid_is_zero(old)); + + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); + shadow = htable_lookup(s, &bd, fid, &waiter, &version); + /* supposed to be unique */ + LASSERT(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT); + *old = *fid; + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assigned) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, int len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c new file mode 100644 index 000000000..993697b66 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lu_ref.h" diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c new file mode 100644 index 000000000..f720e3183 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -0,0 +1,257 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_support.h" +#include "../include/lustre_handles.h" +#include "../include/lustre_lib.h" + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static spinlock_t handle_base_lock; + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + return retval; +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(struct rcu_head *rcu) +{ + struct portals_handle *h = RCU2HANDLE(rcu); + void *ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timeval tv; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + spin_lock_init(&handle_base_lock); + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle %#llx addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c new file mode 100644 index 000000000..64b2f35e2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lprocfs_status.h" + +#define NIDS_MAX 32 + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static struct list_head g_uuid_list; +static spinlock_t g_uuid_lock; + +void class_init_uuidlist(void) +{ + INIT_LIST_HEAD(&g_uuid_list); + spin_lock_init(&g_uuid_lock); +} + +void class_exit_uuidlist(void) +{ + /* delete all */ + class_del_uuid(NULL); +} + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. */ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; +} +EXPORT_SYMBOL(class_add_uuid); + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + LIST_HEAD(deathrow); + struct uuid_nid_data *data; + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while (!list_empty(&deathrow)) { + data = list_entry(deathrow.next, struct uuid_nid_data, + un_list); + list_del(&data->un_list); + + CDEBUG(D_INFO, "del 
uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + return found; +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c b/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c new file mode 100644 index 000000000..6ce9adc2f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -0,0 +1,1953 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include "../include/obd_class.h" +#include +#include "../include/lustre_log.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" + +#include "llog_internal.h" + +static cfs_hash_ops_t uuid_hash_ops; +static cfs_hash_ops_t nid_hash_ops; +static cfs_hash_ops_t nid_stat_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + ptr = strstr(buf, key); + if (ptr == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. 
+ * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + return NULL; + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + return ptr; + ptr++; + } + + return NULL; +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. 
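+   e.g. class_match_param("timeout=40", "timeout=", &val) returns 0 with
+   val left pointing at "40", while a buffer that does not begin with the
+   key returns 1.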
*/ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} +EXPORT_SYMBOL(class_parse_net); + +/* 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified nids */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_nid); + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified networks */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_net); + +/********************** class fns **********************/ + +/** + * Create a new obd device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. 
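+ *
+ * The lustre_cfg is expected to carry the device name in buffer 0, the
+ * obd type name in buffer 1 and the uuid in buffer 2, e.g. as built by
+ * do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL) in
+ * lustre_start_simple() (obd_mount.c).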
+ */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + return -EINVAL; + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + return -EINVAL; + } + name = lustre_cfg_string(lcfg, 0); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + return -EINVAL; + } + uuid = lustre_cfg_string(lcfg, 2); + + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + goto out; + } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + cfs_init_timer(&obd->obd_recovery_timer); + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + + llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + rc = -EINVAL; + goto out; + } + memcpy(obd->obd_uuid.uuid, uuid, len); + + /* do the attach */ + if (OBP(obd, attach)) { + rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg); + if (rc) { + rc = -EINVAL; + goto out; + } + } + + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + return 0; + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. 
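+ *
+ * The device must already be attached. On success obd_set_up is set and
+ * an extra "setup" reference is taken (dropped again by class_cleanup());
+ * on failure the uuid/nid/nid-stats hashes and the self-export are torn
+ * down before returning.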
+ */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + struct obd_export *exp; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + return -ENODEV; + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + return -EEXIST; + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + return -EEXIST; + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. */ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) { + err = -ENOMEM; + goto err_hash; + } + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) { + err = -ENOMEM; + goto err_hash; + } + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) { + err = -ENOMEM; + goto err_hash; + } + + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto err_hash; + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + goto err_exp; + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + return 0; +err_exp: + if (obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. 
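+ *
+ * Returns -EBUSY while the device is still set up, so class_cleanup()
+ * must have run first; otherwise obd_attached is cleared and the "attach"
+ * reference taken in class_attach() is dropped.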
+ */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + return -EBUSY; + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + return -ENODEV; + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "attach", obd); + return 0; +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progress ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + return -ENODEV; + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + return -ENODEV; + } + /* Leave this on forever */ + obd->obd_stopping = 1; + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) { + spin_unlock(&obd->obd_dev_lock); + + cond_resched(); + + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcount - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0); + class_disconnect_exports(obd); + } + + /* Precleanup, we must make sure all exports get destroyed. 
*/ + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + return 0; +} +EXPORT_SYMBOL(class_cleanup); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int err; + int refs; + + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); + lu_ref_del(&obd->obd_reference, scope, source); + + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + + if ((refs == 1) && obd->obd_stopping) { + /* All exports have been destroyed; there should + be no more in-progress ops by this point.*/ + + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + if (OBP(obd, detach)) { + err = OBP(obd, detach)(obd); + if (err) + CERROR("Detach returned %d\n", err); + } + class_release_dev(obd); + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + return -EINVAL; + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + return -EINVAL; + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + return -EINVAL; + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + return rc; +} +EXPORT_SYMBOL(class_add_conn); + +/** Remove a failover nid location. 
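+ * Unlike class_add_conn(), only mdc and osc client devices are accepted.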
+ */ +int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + return -EINVAL; + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + return -EINVAL; + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + return -EINVAL; + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + return rc; +} + +LIST_HEAD(lustre_profile_list); + +struct lustre_profile *class_get_profile(const char *prof) +{ + struct lustre_profile *lprof; + + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + return lprof; + } + } + return NULL; +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. + */ +int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + return -ENOMEM; + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_md, mdc, mdclen); + } + + list_add(&lprof->lp_list, &lustre_profile_list); + return err; + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + return err; +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); + } +} +EXPORT_SYMBOL(class_del_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); + } +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, 
PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + return -EINVAL; + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + return 0; +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. + * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + + if (cfg == NULL || new_name == NULL) + return ERR_PTR(-EINVAL); + + param = lustre_cfg_string(cfg, 1); + if (param == NULL) + return ERR_PTR(-EINVAL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (new_param == NULL) + return ERR_PTR(-ENOMEM); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) { + OBD_FREE(new_param, new_len); + return ERR_PTR(-ENOMEM); + } + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs); + + OBD_FREE(new_param, new_len); + OBD_FREE_PTR(bufs); + if (new_cfg == NULL) + return ERR_PTR(-ENOMEM); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; + + return new_cfg; +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static int process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + struct timeval start; + struct timeval end; + int rc; + + + /* Add upcall processing here. 
Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + return -EINVAL; + } + + do_gettimeofday(&start); + rc = call_usermodehelper(argv[0], argv, NULL, 1); + do_gettimeofday(&end); + + if (rc < 0) { + CERROR( + "lctl: error invoking upcall %s %s %s: rc = %d; time %ldus\n", + argv[0], argv[1], argv[2], rc, + cfs_timeval_sub(&end, &start, NULL)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + cfs_timeval_sub(&end, &start, NULL)); + rc = 0; + } + + return rc; +} + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch (lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + goto out; + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx (%s)\n", + lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, + libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + goto out; + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + goto out; + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. 
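+ * A profile maps the client profile name (buffer 1) to its osc (data)
+ * device name (buffer 2) and optional mdc (metadata) device name
+ * (buffer 3), which class_get_profile() can later look up by name.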
*/ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + goto out; + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + err = 0; + goto out; + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + err = 0; + goto out; + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + err = 0; + goto out; + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + err = 0; + goto out; + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + err = 0; + goto out; + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + goto out; + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
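+ * An error from class_set_global() is therefore reported with a warning
+ * below and then discarded.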
+ */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + err = 0; + goto out; + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + goto out; + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + goto out; + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + err = -EINVAL; + goto out; + } + + switch (lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + goto out; + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + err = 0; + goto out; + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + err = 0; + goto out; + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + err = 0; + goto out; + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + err = 0; + goto out; + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + err = 0; + goto out; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + err = 0; + goto out; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + err = 0; + goto out; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + err = 0; + goto out; + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + goto out; + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) +{ + struct lprocfs_vars *var; + struct file fakefile; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + return -EINVAL; + } + + /* fake a seq file so that var->fops->write can work... */ + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + class_match_param(key, prefix, &key); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s (missing '=')\n", key); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, (char *)var->name, NULL) == 0 + && keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + if (var->fops && var->fops->write) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->fops->write)(&fakefile, sval, + vallen, NULL); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* If the prefix doesn't match, return error so we + can pass it down the stack */ + if (strnchr(key, keylen, '.')) + return -ENOSYS; + CERROR("%s: unknown param %s\n", + (char *)lustre_cfg_string(lcfg, 0), key); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("writing proc entry %s err %d\n", + var->name, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", + lustre_cfg_string(lcfg, 0), + (int)strlen(prefix) - 1, prefix, + (int)(sval - key - 1), key, sval); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + return rc; +} +EXPORT_SYMBOL(class_process_proc_param); + +extern int lustre_check_exclusion(struct super_block *sb, char *svname); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
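+ *
+ * LCFG_MARKER records drive the skip/exclude state kept in
+ * clli->cfg_flags (CM_START/CM_SKIP/CM_EXCLUDE/CM_END), and for client
+ * instances the device name in buffer 0 is rewritten to
+ * "<name>-<cfg_instance>" so that each mount gets its own obd names.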
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + + //class_config_dump_handler(handle, rec, data); + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int inst = 0, swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + goto out; + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + clli->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(clli->cfg_flags & CFG_F_COMPAT146) && + !(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! (inst: %p, uuid: %s, flags: %#x)\n", + clli->cfg_instance, + clli->cfg_uuid.uuid, clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + if (clli->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". 
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + + + if (clli->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_init(&bufs, lcfg); + + if (clli && clli->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(clli->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) { + rc = -ENOMEM; + goto out; + } + sprintf(inst_name, "%s-%p", + lustre_cfg_string(lcfg, 0), + clli->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to insure they + are unique */ + if (clli && clli->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } + + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); + + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? 
*/ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); + + if (inst) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + return rc; +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = {0, 0}; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + return rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto parse_out; + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + return rc; +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * parse config record and output dump in supplied buffer. 
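+ * The dump is a sequence of "key=value" tokens followed by either the
+ * marker contents or the config buffers; the return value is the number
+ * of bytes written into \a buf.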
+ * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + /* return consumed bytes */ + rc = ptr - buf; + return rc; +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + return -ENOMEM; + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + return rc; +} + +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_handle *llh; + int rc; + + LCONSOLE_INFO("Dumping config log %s\n", name); + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + return rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto parse_out; + + rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL); +parse_out: + llog_close(env, llh); + + LCONSOLE_INFO("End config log %s\n", name); + return rc; +} +EXPORT_SYMBOL(class_config_dump_llog); + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
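+ * The flags buffer passed with LCFG_CLEANUP is built from obd_force ("F")
+ * and obd_fail ("A"), matching what class_cleanup() parses above.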
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + + if (!obd) { + CERROR("empty cleanup\n"); + return -EALREADY; + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) + return -ENOMEM; + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + goto out; + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + return &exp->exp_connection->c_peer.nid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed; +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct 
obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static cfs_hash_ops_t nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c new file mode 100644 index 000000000..3437b2ecf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -0,0 +1,1319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include "../include/obd.h" +#include "../include/linux/lustre_compat25.h" +#include "../include/obd_class.h" +#include "../include/lustre/lustre_user.h" +#include "../include/lustre_log.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. + */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + return -ENOMEM; + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' failed from the MGS (%d). Make sure this client and the MGS are running compatible versions of Lustre.\n", + mgc->obd_name, logname, rc); + + if (rc) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' failed (%d). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.\n", + mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + return rc; +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + if (!mgc) + return -ENOENT; + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. 
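+ * do_lcfg() packs cfgname into buffer 0 and s1..s4 into buffers 1..4,
+ * stamps the nid and hands the record to class_process_config();
+ * lustre_start_simple() uses it to issue LCFG_ATTACH followed by
+ * LCFG_SETUP, detaching again if setup fails.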
+ */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(do_lcfg); + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. + */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j, len; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + return -EINVAL; + } + + mutex_lock(&mgc_start_lock); + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) { + rc = -ENOMEM; + goto out_free; + } + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? 
lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) { + rc = -ENOMEM; + goto out_free; + } + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + goto out_free; + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data, NULL); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting not compatible with current mgc. Force to use current mgc setting that is IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. */ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname, + recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + rc = 0; + goto out; + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, + NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + rc = -EINVAL; + goto out_free; + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, + NULL, NULL, NULL); + i++; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, NULL, NULL, NULL); + i++; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + rc = -EINVAL; + goto out_free; + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (!uuid) { + rc = -ENOMEM; + goto out_free; + } + + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + OBD_FREE_PTR(uuid); + if (rc) + goto out_free; + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + 
class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, NULL, NULL, NULL); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + rc = -ENOTCONN; + goto out_free; + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + goto out_free; + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + goto out; + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + return rc; +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = NULL, *ptr = NULL; + int i, rc = 0, len = 0; + + if (!lsi) + return -ENOENT; + obd = lsi->lsi_mgc; + if (!obd) + return -ENOENT; + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + rc = -EBUSY; + goto out; + } + + /* The MGC has no recoverable data in any case. 
+ * force shutdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + goto out; + + /* Clean the nid uuids */ + if (!niduuid) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, NULL, NULL, NULL); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + return rc; +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + return NULL; + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + return NULL; + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + + return lsi; +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + return 0; +} + +/* The lsi has one reference for every server that is using the disk - + e.g. 
MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev); + lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL; + lsi->lsi_dt_dev = NULL; + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + return 1; + } + return 0; +} + +/*** SERVER NAME *** + * + * FSNAME is between 1 and 8 characters (inclusive). + * Excluded characters are '/' and ':' + * SEPARATOR is either ':' or '-' + * TYPE: "OST", "MDT", etc. + * INDEX: Hex representation of the index + */ + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. + * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash; + + dash = svname + strnlen(svname, 8); /* max fsname length is 8 */ + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (endptr != NULL) + *endptr = dash; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + + +/* Get the index from the obd name. 
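+   A few illustrative inputs (following the parsing below):
+     "lustre-OST003F"     -> rc = LDD_F_SV_TYPE_OST, *idx = 0x3f
+     "lustre-MDT0000"     -> rc = LDD_F_SV_TYPE_MDT, *idx = 0x0
+     "lustre-MDTall"      -> rc = LDD_F_SV_TYPE_MDT | LDD_F_SV_ALL
+     "lustre-MDT0000-mdc" -> as the second case, with *endptr advanced
+                             past the trailing "-mdc"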
+ rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + unsigned long index; + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strncmp(dash, "all", 3) == 0) { + if (endptr != NULL) + *endptr = dash + 3; + return rc | LDD_F_SV_ALL; + } + + index = simple_strtoul(dash, (char **)endptr, 16); + if (idx != NULL) + *idx = index; + + /* Account for -mdc after index that is possible when specifying mdt */ + if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1, + sizeof(LUSTRE_MDC_NAME)-1) == 0) + *endptr += sizeof(LUSTRE_MDC_NAME); + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common between server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + return rc; + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. */ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + lu_types_stop(); + return rc; +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + return 0; + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + return 1; + } + } + return 0; +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 index, *exclude_list; + int rc = 0, devmax; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
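+     For illustration: "exclude=lustre-OST0001:lustre-OST0002" (given as the
+     last option on the line) enters here pointing at the '=', so
+     devmax = strlen(ptr) / 8 + 1 = 4, and the parse below ends up with
+     lmd_exclude_count = 2 and lmd_exclude = { 0x0001, 0x0002 }.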
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + return -ENOMEM; + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + return rc; +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** Parse mount line options + * e.g. 
mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that /sbin/mount.lustre is installed.\n"); + return -EINVAL; + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of /sbin/mount.lustre. Please install version %s\n", + LUSTRE_VERSION_STRING); + return -EINVAL; + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, 4096); + if (lmd->lmd_params == NULL) + return -ENOMEM; + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. 
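+  (Illustratively, a second mgsnode=<nid-list> later on the same line is
+  appended by lmd_parse_mgs() after a ':' separator, so lmd_mgs becomes
+  "<first list>:<second list>" and the second list is treated as a
+  failover MGS location.)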
*/ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + int length; + char *tail = strchr(s1 + 6, ','); + if (tail == NULL) + length = strlen(s1); + else + length = tail - s1; + length -= 6; + strncat(lmd->lmd_params, s1 + 6, length); + strcat(lmd->lmd_params, " "); + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. */ + *s1 = '\0'; + break; + } + + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name (need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') ; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); + if (!lmd->lmd_profile) + return -ENOMEM; + sprintf(lmd->lmd_profile, "%s-client", s1); + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + return -ENOMEM; + strcpy(lmd->lmd_dev, devname); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + return -ENOMEM; + strcpy(lmd->lmd_opts, options); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + return rc; + +invalid: + CERROR("Bad mount options %s\n", options); + return -EINVAL; +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. 
-o flock,abort_recov) + */ +int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + return -ENOMEM; + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + rc = -EINVAL; + goto out; + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (client_fill_super == NULL) + request_module("lustre"); + if (client_fill_super == NULL) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for client mount! Is the 'lustre' module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_put_lsi(sb); + goto out; + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { + CERROR("This is client-side-only module, cannot handle server mount.\n"); + rc = -EINVAL; + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + goto out; +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. 
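+   The expected sequence is that the client ("lustre"/llite) module
+   registers its entry point when it loads, roughly:
+
+       lustre_register_client_fill_super(ll_fill_super);
+
+   after which lustre_fill_super() above can reach it through the
+   client_fill_super pointer, loading the module on demand with
+   request_module("lustre") if it is not registered yet.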
*/ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + .lmd2_mnt = NULL + }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} + +static void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("lustre"); + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c b/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c new file mode 100644 index 000000000..307ffe347 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -0,0 +1,362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks). */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid) +{ + u32 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1 << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + if (valid & OBD_MD_FLCOOKIE) + 
dst->o_lcookie = src->o_lcookie; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +/* returns FALSE if comparison (by flags) is same, TRUE if changed */ +int obdo_cmp_md(struct obdo *dst, struct obdo *src, u32 compare) +{ + int res = 0; + + if (compare & OBD_MD_FLATIME) + res |= dst->o_atime != src->o_atime; + if (compare & OBD_MD_FLMTIME) + res |= dst->o_mtime != src->o_mtime; + if (compare & OBD_MD_FLCTIME) + res |= dst->o_ctime != src->o_ctime; + if (compare & OBD_MD_FLSIZE) + res |= dst->o_size != src->o_size; + if (compare & OBD_MD_FLBLOCKS) /* allocation of space */ + res |= dst->o_blocks != src->o_blocks; + if (compare & OBD_MD_FLBLKSZ) + res |= dst->o_blksize != src->o_blksize; + if (compare & OBD_MD_FLTYPE) + res |= ((dst->o_mode ^ src->o_mode) & S_IFMT) != 0; + if (compare & OBD_MD_FLMODE) + res |= ((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0; + if (compare & OBD_MD_FLUID) + res |= dst->o_uid != src->o_uid; + if (compare & OBD_MD_FLGID) + res |= dst->o_gid != src->o_gid; + if (compare & OBD_MD_FLFLAGS) + res |= dst->o_flags != src->o_flags; + if (compare & OBD_MD_FLNLINK) + res |= dst->o_nlink != src->o_nlink; + if (compare & OBD_MD_FLFID) { + res |= dst->o_parent_seq != src->o_parent_seq; + res |= dst->o_parent_ver != src->o_parent_ver; + } + if (compare & OBD_MD_FLGENER) + res |= dst->o_parent_oid != src->o_parent_oid; + /* XXX Don't know if these should be included here - wasn't previously + if ( compare & OBD_MD_FLINLINE ) + res |= memcmp(dst->o_inline, src->o_inline); + */ + return res; +} +EXPORT_SYMBOL(obdo_cmp_md); + +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. 
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid) +{ + if (ia_valid & ATTR_ATIME) { + oa->o_atime = LTIME_S(attr->ia_atime); + oa->o_valid |= OBD_MD_FLATIME; + } + if (ia_valid & ATTR_MTIME) { + oa->o_mtime = LTIME_S(attr->ia_mtime); + oa->o_valid |= OBD_MD_FLMTIME; + } + if (ia_valid & ATTR_CTIME) { + oa->o_ctime = LTIME_S(attr->ia_ctime); + oa->o_valid |= OBD_MD_FLCTIME; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = attr->ia_size; + oa->o_valid |= OBD_MD_FLSIZE; + } + if (ia_valid & ATTR_MODE) { + oa->o_mode = attr->ia_mode; + oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE; + if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) && + !capable(CFS_CAP_FSETID)) + oa->o_mode &= ~S_ISGID; + } + if (ia_valid & ATTR_UID) { + oa->o_uid = from_kuid(&init_user_ns, attr->ia_uid); + oa->o_valid |= OBD_MD_FLUID; + } + if (ia_valid & ATTR_GID) { + oa->o_gid = from_kgid(&init_user_ns, attr->ia_gid); + oa->o_valid |= OBD_MD_FLGID; + } +} +EXPORT_SYMBOL(obdo_from_iattr); + +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid) +{ + valid &= oa->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %llu/%llu\n", + oa->o_valid, oa->o_mtime, oa->o_ctime); + + attr->ia_valid = 0; + if (valid & OBD_MD_FLATIME) { + LTIME_S(attr->ia_atime) = oa->o_atime; + attr->ia_valid |= ATTR_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + LTIME_S(attr->ia_mtime) = oa->o_mtime; + attr->ia_valid |= ATTR_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + LTIME_S(attr->ia_ctime) = oa->o_ctime; + attr->ia_valid |= ATTR_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + attr->ia_size = oa->o_size; + attr->ia_valid |= ATTR_SIZE; + } +#if 0 /* you shouldn't be able to change a file's type with setattr */ + if (valid & OBD_MD_FLTYPE) { + attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_valid |= ATTR_MODE; + } +#endif + if (valid & OBD_MD_FLMODE) { + attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_valid |= ATTR_MODE; + if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) && + !capable(CFS_CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + if (valid & OBD_MD_FLUID) { + attr->ia_uid = make_kuid(&init_user_ns, oa->o_uid); + attr->ia_valid |= ATTR_UID; + } + if (valid & OBD_MD_FLGID) { + attr->ia_gid = make_kgid(&init_user_ns, oa->o_gid); + attr->ia_valid |= ATTR_GID; + } +} +EXPORT_SYMBOL(iattr_from_obdo); + +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid) +{ + iattr_from_obdo(&op_data->op_attr, oa, valid); + if (valid & OBD_MD_FLBLOCKS) { + op_data->op_attr_blocks = oa->o_blocks; + op_data->op_attr.ia_valid |= ATTR_BLOCKS; + } + if (valid & OBD_MD_FLFLAGS) { + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + oa->o_flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + } +} +EXPORT_SYMBOL(md_from_obdo); + +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid) +{ + obdo_from_iattr(oa, &op_data->op_attr, valid); + if (valid & ATTR_BLOCKS) { + oa->o_blocks = op_data->op_attr_blocks; + oa->o_valid |= OBD_MD_FLBLOCKS; + } + if (valid & ATTR_ATTR_FLAG) { + oa->o_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } +} +EXPORT_SYMBOL(obdo_from_md); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = cpu_to_le64(sobdo->o_size); + 
dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime); + dobdo->o_atime = cpu_to_le64(sobdo->o_atime); + dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime); + dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks); + dobdo->o_mode = cpu_to_le32(sobdo->o_mode); + dobdo->o_uid = cpu_to_le32(sobdo->o_uid); + dobdo->o_gid = cpu_to_le32(sobdo->o_gid); + dobdo->o_flags = cpu_to_le32(sobdo->o_flags); + dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink); + dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); + dobdo->o_valid = cpu_to_le64(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_cpu_to_le); + +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = le64_to_cpu(sobdo->o_size); + dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime); + dobdo->o_atime = le64_to_cpu(sobdo->o_atime); + dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime); + dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks); + dobdo->o_mode = le32_to_cpu(sobdo->o_mode); + dobdo->o_uid = le32_to_cpu(sobdo->o_uid); + dobdo->o_gid = le32_to_cpu(sobdo->o_gid); + dobdo->o_flags = le32_to_cpu(sobdo->o_flags); + dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink); + dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); + dobdo->o_valid = le64_to_cpu(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_le_to_cpu); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c new file mode 100644 index 000000000..cc785ab3f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/lustre_export.h" +#include "../include/lustre_net.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c b/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c new file mode 100644 index 000000000..ff0a01bcf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" + + +static inline __u32 consume(int nob, __u8 **ptr) +{ + __u32 value; + + LASSERT(nob <= sizeof(value)); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/Makefile b/kernel/drivers/staging/lustre/lustre/obdecho/Makefile new file mode 100644 index 000000000..672028fc7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LUSTRE_FS) += obdecho.o +obdecho-y := echo_client.o lproc_echo.o diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c b/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c new file mode 100644 index 000000000..d542e06d6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -0,0 +1,2197 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_ECHO +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_debug.h" +#include "../include/lprocfs_status.h" +#include "../include/cl_object.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_net.h" + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct cl_site *ed_site; + struct lu_device *ed_next; + int ed_next_islov; +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_stripe_md *eo_lsm; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_stripe_md **eoc_md; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct mutex ep_lock; + struct page *ep_vmpage; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of0(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +/** @} echo_helpers */ + +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsm); +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_enqueue(struct echo_object *eco, u64 start, + u64 end, int mode, __u64 *cookie); +static int cl_echo_cancel(struct echo_device *d, __u64 cookie); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +static struct echo_thread_info *echo_env_info(const struct lu_env *env); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + 
struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock_descr eti_descr; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof(struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof(struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof(struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof(struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. + * + * @{ + */ +static struct page *echo_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2echo_page(slice)->ep_vmpage; +} + +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) + mutex_lock(&ep->ep_lock); + else if (!mutex_trylock(&ep->ep_lock)) + return -EAGAIN; + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(mutex_is_locked(&ep->ep_lock)); + mutex_unlock(&ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct echo_page *ep = cl2echo_page(slice); + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + struct page *vmpage = ep->ep_vmpage; + + atomic_dec(&eco->eo_npages); + page_cache_release(vmpage); +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_vmpage = echo_page_vmpage, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } 
+}; +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static void echo_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); +} + +static int echo_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *unused) +{ + return 1; +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, + .clo_delete = echo_lock_delete, + .clo_fits_into = echo_lock_fits_into +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + + ep->ep_vmpage = vmpage; + page_cache_get(vmpage); + mutex_init(&ep->ep_lock); + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + return 0; +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS); + if (el != NULL) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + return el == NULL ? -ENOMEM : 0; +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + const struct cl_object_conf *cconf; + struct echo_object_conf *econf; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + return -ENOMEM; + lu_object_add(obj, below); + } + + cconf = lu2cl_conf(conf); + econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_md); + eco->eo_lsm = *econf->eoc_md; + /* clear the lsm pointer so that it won't get freed. 
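+	 * Ownership of the lov_stripe_md passed in through the object
+	 * configuration transfers to this echo_object here; the caller sees
+	 * *eoc_md == NULL afterwards and must not free it, since it is
+	 * released later via echo_free_memmd() in echo_object_free().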
*/ + *econf->eoc_md = NULL; + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + return 0; +} + +/* taken from osc_unpackmd() */ +static int echo_alloc_memmd(struct echo_device *ed, + struct lov_stripe_md **lsmp) +{ + int lsm_size; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp == NULL); + OBD_ALLOC(*lsmp, lsm_size); + if (*lsmp == NULL) + return -ENOMEM; + + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if ((*lsmp)->lsm_oinfo[0] == NULL) { + OBD_FREE(*lsmp, lsm_size); + return -ENOMEM; + } + + loi_init((*lsmp)->lsm_oinfo[0]); + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + ostid_set_seq_echo(&(*lsmp)->lsm_oi); + + return lsm_size; +} + +static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp) +{ + int lsm_size; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_free_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp != NULL); + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + return 0; +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + if (eco->eo_lsm) + echo_free_memmd(eco->eo_dev, &eco->eo_lsm); + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + + /* we're the top dev. 
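+	 * (hdr == NULL here, see the LASSERT below; the echo object provides
+	 * its own cl_object_header and, in echo_object_init(), allocates the
+	 * next device's object and stacks it underneath.)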
*/ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS); + if (eco != NULL) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + return obj; +} + +static struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +static struct cl_device_operations echo_device_cl_ops = { +}; + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client. + * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) + return rc; + + ed->ed_site = site; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + cl_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static void echo_thread_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, + .lct_exit = echo_thread_key_exit +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static void echo_session_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, + .lct_exit = echo_session_key_exit +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + + OBD_ALLOC_PTR(ed); + if (ed == NULL) { + rc = -ENOMEM; + goto out; + } + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + goto out; + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + cd->cd_ops = &echo_device_cl_ops; + + cleanup = 2; + obd = 
class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (tgt == NULL) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + rc = -ENODEV; + goto out; + } + + next = tgt->obd_lu_dev; + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { + CERROR("echo MDT client must be run on server\n"); + rc = -EOPNOTSUPP; + goto out; + } + + rc = echo_site_init(env, ed); + if (rc) + goto out; + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + goto out; + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + /* if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next != NULL) { + LASSERT(next != NULL); + if (next->ld_site != NULL) { + rc = -EBUSY; + goto out; + } + + next->ld_site = &ed->ed_site->cs_lu; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + goto out; + + /* Tricky case, I have to determine the obd type since + * CLIO uses the different parameters to initialize + * objects for lov & osc. */ + if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0) + ed->ed_next_islov = 1; + else + LASSERT(strcmp(tgt_type_name, + LUSTRE_OSC_NAME) == 0); + } else { + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + + ed->ed_next = next; + return &cd->cd_lu_dev; +out: + switch (cleanup) { + case 4: { + int rc2; + + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + + case 3: + echo_site_fini(env, ed); + case 2: + cl_device_fini(&ed->ed_cl); + case 1: + OBD_FREE_PTR(ed); + case 0: + default: + break; + } + return ERR_PTR(rc); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_get(clk); + cl_unuse(env, clk); + cl_lock_release(env, clk, "ec enqueue", ecl->el_object); + if (!still_used) { + cl_lock_mutex_get(env, clk); + cl_lock_cancel(env, clk); + cl_lock_delete(env, clk); + cl_lock_mutex_put(env, clk); + } + cl_lock_put(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + /* check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. 
+ */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. */ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR("echo_client still has objects at cleanup time, wait for 1 second\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); + + while (next) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == lu2cl_site(d->ld_site)); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsmp) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct lov_stripe_md *lsm; + struct echo_object *eco; + struct cl_object *obj; + struct lu_fid *fid; + int refcheck; + int rc; + + LASSERT(lsmp); + lsm = *lsmp; + LASSERT(lsm); + LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi)); + LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n", + POSTID(&lsm->lsm_oi)); + + /* Never return an object if the obd is to be freed. 
*/ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + return ERR_PTR(-ENODEV); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return (void *)env; + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + if (!d->ed_next_islov) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[0]; + + LASSERT(oinfo != NULL); + oinfo->loi_oi = lsm->lsm_oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } else { + struct lustre_md *md; + + md = &info->eti_md; + memset(md, 0, sizeof(*md)); + md->lsm = lsm; + conf->eoc_cl.u.coc_md = md; + } + } + conf->eoc_md = lsmp; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, &lsm->lsm_oi, 0); + if (rc != 0) { + eco = ERR_PTR(rc); + goto out; + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) { + eco = (void *)obj; + goto out; + } + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + cl_env_put(env, &refcheck); + return eco; +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + return 0; +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie, __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + + info = echo_env_info(env); + io = &info->eti_io; + descr = &info->eti_descr; + obj = echo_obj2cl(eco); + + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? 
CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + lck = cl_lock_request(env, io, descr, "ec enqueue", eco); + if (lck) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + rc = cl_wait(env, lck); + if (rc == 0) { + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } else { + cl_lock_release(env, lck, "ec enqueue", current); + } + } + return rc; +} + +static int cl_echo_enqueue(struct echo_object *eco, u64 start, u64 end, + int mode, __u64 *cookie) +{ + struct echo_thread_info *info; + struct lu_env *env; + struct cl_io *io; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + info = echo_env_info(env); + io = &info->eti_io; + + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco)); + if (result < 0) + goto out; + LASSERT(result == 0); + + result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0); + cl_io_fini(env, io); + +out: + cl_env_put(env, &refcheck); + return result; +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + return -ENOENT; + + echo_lock_release(env, ecl, still_used); + return 0; +} + +static int cl_echo_cancel(struct echo_device *ed, __u64 cookie) +{ + struct lu_env *env; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + rc = cl_echo_cancel0(env, ed, cookie); + + cl_env_put(env, &refcheck); + return rc; +} + +static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type unused, struct cl_2queue *queue) +{ + struct cl_page *clp; + struct cl_page *temp; + int result = 0; + + cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) { + int rc; + + rc = cl_page_cache_add(env, io, clp, CRT_WRITE); + if (rc == 0) + continue; + result = result ?: rc; + } + return result; +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int refcheck; + int rc; + int i; + + LASSERT((offset & ~CFS_PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + goto out; + LASSERT(rc == 0); + + + rc = cl_echo_enqueue0(env, eco, offset, 
+ offset + npages * PAGE_CACHE_SIZE - 1, + rw == READ ? LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + goto error_lock; + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_echo_async_brw(env, io, typ, queue); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + + +static u64 last_object_id; + +static int +echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) +{ + struct lov_stripe_md *ulsm = _ulsm; + int nob, i; + + nob = offsetof(struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]); + if (nob > ulsm_nob) + return -EINVAL; + + if (copy_to_user(ulsm, lsm, sizeof(*ulsm))) + return -EFAULT; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_to_user(ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return -EFAULT; + } + return 0; +} + +static int +echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm, + void *ulsm, int ulsm_nob) +{ + struct echo_client_obd *ec = ed->ed_ec; + int i; + + if (ulsm_nob < sizeof(*lsm)) + return -EINVAL; + + if (copy_from_user(lsm, ulsm, sizeof(*lsm))) + return -EFAULT; + + if (lsm->lsm_stripe_count > ec->ec_nstripes || + lsm->lsm_magic != LOV_MAGIC || + (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 || + ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) + return -EINVAL; + + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_from_user(lsm->lsm_oinfo[i], + ((struct lov_stripe_md *)ulsm)-> \ + lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return -EFAULT; + } + return 0; +} + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + int on_target, struct obdo *oa, void *ulsm, + int ulsm_nob, struct obd_trans_info *oti) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = NULL; + int rc; + int created = 0; + + if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */ + (on_target || /* set_stripe */ + ec->ec_nstripes != 0)) { /* LOV */ + CERROR("No valid oid\n"); + return -EINVAL; + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) { + CERROR("Cannot allocate md: rc = %d\n", rc); + goto failed; + } + + if (ulsm != NULL) { + int i, idx; + + rc = echo_copyin_lsm(ed, lsm, ulsm, ulsm_nob); + if (rc != 0) + goto failed; + + if (lsm->lsm_stripe_count == 0) + lsm->lsm_stripe_count = ec->ec_nstripes; + + if (lsm->lsm_stripe_size == 0) + lsm->lsm_stripe_size = PAGE_CACHE_SIZE; + + idx = cfs_rand(); + + /* setup stripes: indices + default ids if required */ + for (i = 0; i < 
lsm->lsm_stripe_count; i++) { + if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0) + lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi; + + lsm->lsm_oinfo[i]->loi_ost_idx = + (idx + i) % ec->ec_nstripes; + } + } + + /* setup object ID here for !on_target and LOV hint */ + if (oa->o_valid & OBD_MD_FLID) { + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + lsm->lsm_oi = oa->o_oi; + } + + if (ostid_id(&lsm->lsm_oi) == 0) + ostid_set_id(&lsm->lsm_oi, ++last_object_id); + + rc = 0; + if (on_target) { + /* Only echo objects are allowed to be created */ + LASSERT((oa->o_valid & OBD_MD_FLGROUP) && + (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO)); + rc = obd_create(env, ec->ec_exp, oa, &lsm, oti); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + goto failed; + } + created = 1; + } + + /* See what object ID we were given */ + oa->o_oi = lsm->lsm_oi; + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &lsm); + if (IS_ERR(eco)) { + rc = PTR_ERR(eco); + goto failed; + } + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + + failed: + if (created && rc) + obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL); + if (lsm) + echo_free_memmd(ed, &lsm); + if (rc) + CERROR("create object failed with: rc = %d\n", rc); + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct lov_stripe_md *lsm = NULL; + struct echo_object *eco; + int rc; + + if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) { + /* disallow use of object id 0 */ + CERROR("No valid oid\n"); + return -EINVAL; + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) + return rc; + + lsm->lsm_oi = oa->o_oi; + if (!(oa->o_valid & OBD_MD_FLGROUP)) + ostid_set_seq_echo(&lsm->lsm_oi); + + rc = 0; + eco = cl_echo_object_find(ed, &lsm); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + if (lsm) + echo_free_memmd(ed, &lsm); + return rc; +} + +static void echo_put_object(struct echo_object *eco) +{ + if (cl_echo_object_put(eco)) + CERROR("echo client: drop an object failed"); +} + +static void +echo_get_stripe_off_id(struct lov_stripe_md *lsm, u64 *offp, u64 *idp) +{ + unsigned long stripe_count; + unsigned long stripe_size; + unsigned long width; + unsigned long woffset; + int stripe_index; + u64 offset; + + if (lsm->lsm_stripe_count <= 1) + return; + + offset = *offp; + stripe_size = lsm->lsm_stripe_size; + stripe_count = lsm->lsm_stripe_count; + + /* width = # bytes in all stripes */ + width = stripe_size * stripe_count; + + /* woffset = offset within a width; offset = whole number of widths */ + woffset = do_div(offset, width); + + stripe_index = woffset / stripe_size; + + *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi); + *offp = offset * stripe_size + woffset % stripe_size; +} + +static void +echo_client_page_debug_setup(struct lov_stripe_md *lsm, + struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int 
echo_client_page_debug_check(struct lov_stripe_md *lsm, + struct page *page, u64 id, + u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async, + struct obd_trans_info *oti) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + u32 npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + int i; + int rc; + int verify; + gfp_t gfp_mask; + int brw_flags = 0; + + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + LASSERT(lsm != NULL); + LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi)); + + if (count <= 0 || + (count & (~CFS_PAGE_MASK)) != 0) + return -EINVAL; + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_CACHE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + return -ENOMEM; + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + return -ENOMEM; + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_CACHE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + OBD_PAGE_ALLOC(pgp->pg, gfp_mask); + if (pgp->pg == NULL) + goto out; + + pages[i] = pgp->pg; + pgp->count = PAGE_CACHE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(lsm, pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + int vrc; + + vrc = echo_client_page_debug_check(lsm, pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + OBD_PAGE_FREE(pgp->pg); + } + OBD_FREE(pga, npages * sizeof(*pga)); + OBD_FREE(pages, npages * sizeof(*pages)); + return rc; +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, struct obd_trans_info *oti, + int async) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote *rnb; + u64 off; + u64 npages, tot_pages; + int i, ret = 0, brw_flags = 0; + + if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 || + (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi))) + return -EINVAL; + + npages = batch >> PAGE_CACHE_SHIFT; + tot_pages = count >> 
PAGE_CACHE_SHIFT; + + OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local)); + OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote)); + + if (lnb == NULL || rnb == NULL) { + ret = -ENOMEM; + goto out; + } + + if (rw == OBD_BRW_WRITE && async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) { + rnb[i].offset = off; + rnb[i].len = PAGE_CACHE_SIZE; + rnb[i].flags = brw_flags; + } + + ioo.ioo_bufcnt = npages; + oti->oti_transno = 0; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages, + lnb, oti, NULL); + if (ret != 0) + goto out; + LASSERT(lpages == npages); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].page; + + /* read past eof? */ + if (page == NULL && lnb[i].rc == 0) + continue; + + if (async) + lnb[i].flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(lsm, page, rw, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + else + echo_client_page_debug_check(lsm, page, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, + rnb, npages, lnb, oti, ret); + if (ret != 0) + goto out; + + /* Reset oti otherwise it would confuse ldiskfs. */ + memset(oti, 0, sizeof(*oti)); + + /* Reuse env context. */ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + if (lnb) + OBD_FREE(lnb, npages * sizeof(struct niobuf_local)); + if (rnb) + OBD_FREE(rnb, npages * sizeof(struct niobuf_remote)); + return ret; +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data, + struct obd_trans_info *dummy_oti) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 1; + long test_mode; + + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + return rc; + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (test_mode == 1) + async = 0; + + if (ed->ed_next == NULL && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + /* fall through */ + case 2: + rc = echo_client_kbrw(ed, rw, oa, + eco, data->ioc_offset, + data->ioc_count, async, dummy_oti); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, + eco, data->ioc_offset, + data->ioc_count, data->ioc_plen1, + dummy_oti, async); + break; + default: + rc = -EINVAL; + } + echo_put_object(eco); + return rc; +} + +static int +echo_client_enqueue(struct obd_export *exp, struct obdo *oa, + int mode, u64 offset, u64 nob) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + struct lustre_handle *ulh = &oa->o_handle; + struct echo_object *eco; + u64 end; + int rc; + + if (ed->ed_next == NULL) + return -EOPNOTSUPP; + + if (!(mode == LCK_PR || mode == LCK_PW)) + return -EINVAL; + + 
if ((offset & (~CFS_PAGE_MASK)) != 0 || + (nob & (~CFS_PAGE_MASK)) != 0) + return -EINVAL; + + rc = echo_get_object(&eco, ed, oa); + if (rc != 0) + return rc; + + end = (nob == 0) ? ((u64) -1) : (offset + nob - 1); + rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie); + if (rc == 0) { + oa->o_valid |= OBD_MD_FLHANDLE; + CDEBUG(D_INFO, "Cookie is %#llx\n", ulh->cookie); + } + echo_put_object(eco); + return rc; +} + +static int +echo_client_cancel(struct obd_export *exp, struct obdo *oa) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + __u64 cookie = oa->o_handle.cookie; + + if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) + return -EINVAL; + + CDEBUG(D_INFO, "Cookie is %#llx\n", cookie); + return cl_echo_cancel(ed, cookie); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct obd_trans_info dummy_oti; + struct lu_env *env; + struct oti_req_ack_lock *ack_lock; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; + int i; + + memset(&dummy_oti, 0, sizeof(dummy_oti)); + + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + return rc; + + OBD_ALLOC_PTR(env); + if (env == NULL) + return -ENOMEM; + + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) { + rc = -ENOMEM; + goto out; + } + + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + goto out; + + case OBD_IOC_DESTROY: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm, + &dummy_oti, NULL, NULL); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + goto out; + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + + oinfo.oi_md = eco->eo_lsm; + oinfo.oi_oa = oa; + rc = obd_getattr(env, ec->ec_exp, &oinfo); + echo_put_object(eco); + } + goto out; + + case OBD_IOC_SETATTR: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + + oinfo.oi_oa = oa; + oinfo.oi_md = eco->eo_lsm; + + rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL); + echo_put_object(eco); + } + goto out; + + case OBD_IOC_BRW_WRITE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rw = OBD_BRW_WRITE; + /* fall through */ + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti); + goto out; + + case ECHO_IOC_GET_STRIPE: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1, + data->ioc_plen1); + echo_put_object(eco); + } + goto out; + + case ECHO_IOC_SET_STRIPE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + if (data->ioc_pbuf1 == NULL) { /* unset */ + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + eco->eo_deleted = 1; + echo_put_object(eco); + } + } else { + 
rc = echo_create_object(env, ed, 0, oa, + data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + } + goto out; + + case ECHO_IOC_ENQUEUE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_client_enqueue(exp, oa, + data->ioc_conn1, /* lock mode */ + data->ioc_offset, + data->ioc_count);/*extent*/ + goto out; + + case ECHO_IOC_CANCEL: + rc = echo_client_cancel(exp, oa); + goto out; + + default: + CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + rc = -ENOTTY; + goto out; + } + +out: + lu_env_fini(env); + OBD_FREE_PTR(env); + + /* XXX this should be in a helper also called by target_send_reply */ + for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4; + i++, ack_lock++) { + if (!ack_lock->mode) + break; + ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); + } + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + return -EINVAL; + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + return -EINVAL; + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + ec->ec_nstripes = 0; + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID; + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. 
*/ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return rc; + } + + return rc; +} + +static int echo_client_cleanup(struct obd_device *obddev) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + int rc; + + if (!list_empty(&obddev->obd_exports)) { + CERROR("still has clients!\n"); + return -EBUSY; + } + + LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + return rc; +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + rc = class_connect(&conn, src, cluuid); + if (rc == 0) { + *exp = class_conn2export(&conn); + } + + return rc; +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + + if (exp == NULL) { + rc = -EINVAL; + goto out; + } + + rc = class_disconnect(exp); + goto out; + out: + return rc; +} + +static struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +int echo_client_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + lprocfs_echo_init_vars(&lvars); + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, + lvars.module_vars, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + return rc; +} + +void echo_client_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); +} + +static int __init obdecho_init(void) +{ + struct lprocfs_static_vars lvars; + + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + + lprocfs_echo_init_vars(&lvars); + + + return echo_client_init(); +} + +static void /*__exit*/ obdecho_exit(void) +{ + echo_client_exit(); + +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Testing Echo OBD driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h new file mode 100644 index 000000000..8e9dbc235 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c new file mode 100644 index 000000000..0beb97db7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_ECHO + +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" + +#if defined(CONFIG_PROC_FS) +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { "uuid", &echo_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(echo, numrefs); +static struct lprocfs_vars lprocfs_echo_module_vars[] = { + { "num_refs", &echo_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_echo_module_vars; + lvars->obd_vars = lprocfs_echo_obd_vars; +} +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/Makefile b/kernel/drivers/staging/lustre/lustre/osc/Makefile new file mode 100644 index 000000000..54927fba4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_LUSTRE_FS) += osc.o +osc-y := osc_request.o osc_dev.o osc_object.o \ + osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o +osc-$(CONFIG_PROC_FS) += lproc_osc.o diff --git a/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c b/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c new file mode 100644 index 000000000..15a662098 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -0,0 +1,751 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include +#include "osc_internal.h" + +static int osc_active_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + LPROCFS_CLIMP_CHECK(dev); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + + return 0; +} + +static ssize_t osc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > 1) + return -ERANGE; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val); + + return count; +} +LPROC_SEQ_FOPS(osc_active); + +static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(dev); + if (pool && val > cli->cl_max_rpcs_in_flight) + pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); + +static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long val; + int mult; + + client_obd_list_lock(&cli->cl_loi_list_lock); + val = cli->cl_dirty_max; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + mult = 1 << 20; + return lprocfs_seq_read_frac_helper(m, val, mult); +} + +static ssize_t osc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number <= 0 || + pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) || + pages_number > totalram_pages / 4) /* 1/4 of RAM */ + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max = (u32)(pages_number << PAGE_CACHE_SHIFT); + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(osc_max_dirty_mb); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - 
PAGE_CACHE_SHIFT; + + seq_printf(m, + "used_mb: %d\n" + "busy_cnt: %d\n", + (atomic_read(&cli->cl_lru_in_list) + + atomic_read(&cli->cl_lru_busy)) >> shift, + atomic_read(&cli->cl_lru_busy)); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) + (void)osc_lru_shrink(cli, rc); + + return count; +} +LPROC_SEQ_FOPS(osc_cached_mb); + +static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_dirty); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_avail_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + int rc; + __u64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* this is only for shrinking grant */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (val >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + return 0; + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_CHECK(obd); + if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + LPROCFS_CLIMP_EXIT(obd); + if (rc) + return rc; + return count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_lost_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); + +static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + seq_printf(m, "%d\n", obd->u.cli.cl_grant_shrink_interval); + return 0; +} + +static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); 
+ if (rc) + return rc; + + if (val <= 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + + return count; +} +LPROC_SEQ_FOPS(osc_grant_shrink_interval); + +static int osc_checksum_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); + return 0; +} + +static ssize_t osc_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = (val ? 1 : 0); + + return count; +} +LPROC_SEQ_FOPS(osc_checksum); + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + DECLARE_CKSUM_NAME; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == (1 << i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_putc(m, '\n'); + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int i; + DECLARE_CKSUM_NAME; + char kernbuf[10]; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (!strcmp(kernbuf, cksum_name[i])) { + obd->u.cli.cl_cksum_type = 1 << i; + return count; + } + } + return -EINVAL; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static int osc_resend_count_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); + return 0; +} + +static ssize_t osc_resend_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LPROC_SEQ_FOPS(osc_resend_count); + +static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_contention_time); + return 0; +} + +static ssize_t osc_contention_seconds_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?: + count; +} +LPROC_SEQ_FOPS(osc_contention_seconds); + +static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_lockless_truncate); + return 0; +} + +static ssize_t 
osc_lockless_truncate_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?: + count; +} +LPROC_SEQ_FOPS(osc_lockless_truncate); + +static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_destroy_in_flight)); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); + +static int osc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_obd_rd_max_pages_per_rpc(m, m->private); +} + +static ssize_t osc_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; + int chunk_mask, rc; + __u64 val; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_CACHE_SHIFT; + + LPROCFS_CLIMP_CHECK(dev); + + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) { + LPROCFS_CLIMP_EXIT(dev); + return -ERANGE; + } + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LPROC_SEQ_FOPS(osc_obd_max_pages_per_rpc); + +LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_WR_ONLY(osc, ping); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +static struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { "uuid", &osc_uuid_fops, NULL, 0 }, + { "ping", &osc_ping_fops, NULL, 0222 }, + { "connect_flags", &osc_connect_flags_fops, NULL, 0 }, + { "blocksize", &osc_blksize_fops, NULL, 0 }, + { "kbytestotal", &osc_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &osc_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &osc_kbytesavail_fops, NULL, 0 }, + { "filestotal", &osc_filestotal_fops, NULL, 0 }, + { "filesfree", &osc_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 }, + { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 }, + { "active", &osc_active_fops, NULL }, + { "max_pages_per_rpc", &osc_obd_max_pages_per_rpc_fops, NULL }, + { "max_rpcs_in_flight", &osc_max_rpcs_in_flight_fops, NULL }, + { "destroys_in_flight", &osc_destroys_in_flight_fops, NULL, 0 }, + { "max_dirty_mb", &osc_max_dirty_mb_fops, NULL }, + { "osc_cached_mb", &osc_cached_mb_fops, NULL }, + { "cur_dirty_bytes", &osc_cur_dirty_bytes_fops, NULL, 0 }, + { "cur_grant_bytes", 
&osc_cur_grant_bytes_fops, NULL }, + { "cur_lost_grant_bytes", &osc_cur_lost_grant_bytes_fops, NULL, 0}, + { "grant_shrink_interval", &osc_grant_shrink_interval_fops, NULL }, + { "checksums", &osc_checksum_fops, NULL }, + { "checksum_type", &osc_checksum_type_fops, NULL }, + { "resend_count", &osc_resend_count_fops, NULL}, + { "timeouts", &osc_timeouts_fops, NULL, 0 }, + { "contention_seconds", &osc_contention_seconds_fops, NULL }, + { "lockless_truncate", &osc_lockless_truncate_fops, NULL }, + { "import", &osc_import_fops, NULL }, + { "state", &osc_state_fops, NULL, 0 }, + { "pinger_recov", &osc_pinger_recov_fops, NULL }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(osc, numrefs); +static struct lprocfs_vars lprocfs_osc_module_vars[] = { + { "num_refs", &osc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +#define pct(a, b) (b ? a * 100 / b : 0) + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + do_gettimeofday(&now); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "read RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "pages per rpc rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "rpcs in flight rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "offset rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", 
+ (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +#undef pct + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} + +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + do_gettimeofday(&now); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lproc_osc_attach_seqstat(struct obd_device *dev) +{ + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644, + &osc_rpc_stats_fops, dev); + + return rc; +} + +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_osc_module_vars; + lvars->obd_vars = lprocfs_osc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c b/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c new file mode 100644 index 000000000..d44b3d4ff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -0,0 +1,2944 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * osc cache management. + * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant); + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(lvl, obj) \ + osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 'r' : 'w'; + if (ext->oe_intree) + *buf++ = 'i'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + CDEBUG(lvl, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ +} while (0) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) 
do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + if (n == NULL) + return NULL; + + return container_of(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. */ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + int page_count; + int rc = 0; + + if (!osc_object_is_locked(obj)) { + rc = 9; + goto out; + } + + if (ext->oe_state >= OES_STATE_MAX) { + rc = 10; + goto out; + } + + if (atomic_read(&ext->oe_refc) <= 0) { + rc = 20; + goto out; + } + + if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) { + rc = 30; + goto out; + } + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + rc = 35; + else + rc = 0; + goto out; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) { + rc = 40; + goto out; + } + if (ext->oe_hp) { + rc = 50; + goto out; + } + if (ext->oe_fsync_wait && !ext->oe_urgent) { + rc = 55; + goto out; + } + break; + case OES_CACHE: + if (ext->oe_grants == 0) { + rc = 60; + goto out; + } + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) { + rc = 65; + goto out; + } + default: + if (atomic_read(&ext->oe_users) > 0) { + rc = 70; + goto out; + } + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) { + rc = 80; + goto out; + } + + if (ext->oe_osclock == NULL && ext->oe_grants > 0) { + rc = 90; + goto out; + } + + if (ext->oe_osclock) { + struct cl_lock_descr *descr; + descr = &ext->oe_osclock->cll_descr; + if (!(descr->cld_start <= ext->oe_start && + descr->cld_end >= ext->oe_max_end)) { + rc = 100; + goto out; + } + } + + if (ext->oe_nr_pages > ext->oe_mppr) { + rc = 105; + goto out; + } + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. */ + if (ext->oe_state > OES_CACHE) { + rc = 0; + goto out; + } + + if (!extent_debug) { + rc = 0; + goto out; + } + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = oap2cl_page(oap)->cp_index; + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) { + rc = 110; + goto out; + } + } + if (page_count != ext->oe_nr_pages) { + rc = 120; + goto out; + } + +out: + if (rc != 0) + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s:%d sanity check %p failed with rc = %d\n", + func, line, ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + + +/** + * sanity check - to make sure there is no overlapped extent in the tree. 
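
The overlap test used just below in osc_extent_is_overlapped() is the standard inclusive-interval check: two page ranges overlap unless one ends before the other starts. A minimal standalone C sketch of the same predicate (the struct name and page numbers here are illustrative, not the Lustre types):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative userspace version of the overlap test: two inclusive
 * page ranges overlap unless one ends before the other starts.
 */
struct ext { unsigned long start, end; };        /* page indices, inclusive */

static bool extents_overlap(const struct ext *a, const struct ext *b)
{
        return a->end >= b->start && a->start <= b->end;
}

int main(void)
{
        struct ext a = { 0, 15 }, b = { 10, 31 }, c = { 32, 47 };

        printf("%d\n", extents_overlap(&a, &b));  /* 1: share pages 10..15   */
        printf("%d\n", extents_overlap(&a, &c));  /* 0: disjoint             */
        printf("%d\n", extents_overlap(&b, &c));  /* 0: adjacent, no overlap */
        return 0;
}
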
+ */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + LASSERT(osc_object_is_locked(obj)); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (tmp->oe_end >= ext->oe_start && + tmp->oe_start <= ext->oe_end) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + ext->oe_state = state; + wake_up_all(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + atomic_set(&ext->oe_refc, 1); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_osclock = NULL; + + return ext; +} + +static void osc_extent_free(struct osc_extent *ext) +{ + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) >= 0); + atomic_inc(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 0); + if (atomic_dec_and_test(&ext->oe_refc)) { + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(!ext->oe_intree); + + if (ext->oe_osclock) { + cl_lock_put(env, ext->oe_osclock); + ext->oe_osclock = NULL; + } + osc_extent_free(ext); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(osc_object_is_locked(ext->oe_obj)); + atomic_dec(&ext->oe_refc); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. + */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + LASSERT(osc_object_is_locked(obj)); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. + */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. 
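
osc_extent_search() above returns either the extent covering the given index or its closest predecessor in the tree. The following userspace sketch shows the same decision structure over a sorted, non-overlapping array instead of an rbtree; names and values are illustrative assumptions only:

#include <stdio.h>

struct range { unsigned long start, end; };      /* stand-in for an osc_extent */

/*
 * Sketch of the osc_extent_search() contract: return the range covering
 * @index, or else the closest preceding one.  The real code walks an
 * rbtree; a sorted array gives the same decision structure.
 */
static const struct range *range_search(const struct range *r, int nr,
                                        unsigned long index)
{
        const struct range *prev = NULL;
        int lo = 0, hi = nr - 1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (index < r[mid].start) {
                        hi = mid - 1;            /* go "left"               */
                } else if (index > r[mid].end) {
                        prev = &r[mid];          /* best predecessor so far */
                        lo = mid + 1;            /* go "right"              */
                } else {
                        return &r[mid];          /* covering range found    */
                }
        }
        return prev;
}

int main(void)
{
        const struct range tree[] = { { 0, 15 }, { 32, 47 }, { 64, 79 } };
        const struct range *hit = range_search(tree, 3, 50);

        if (hit)        /* 50 is covered by nothing; expect [32 -> 47] */
                printf("[%lu -> %lu]\n", hit->start, hit->end);
        return 0;
}
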
*/ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(ext->oe_intree == 0); + LASSERT(ext->oe_obj == obj); + LASSERT(osc_object_is_locked(obj)); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); + ext->oe_intree = 1; +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + LASSERT(osc_object_is_locked(obj)); + if (ext->oe_intree) { + rb_erase(&ext->oe_node, &obj->oo_root); + ext->oe_intree = 0; + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(osc_object_is_locked(obj)); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at chunk level. 
+ */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + LASSERT(osc_object_is_locked(obj)); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + LASSERT(cur->oe_osclock == victim->oe_osclock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + cur->oe_grants += victim->oe_grants; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + list_del_init(&victim->oe_link); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + } else { + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + osc_extent_merge(env, ext, prev_extent(ext)); + osc_extent_merge(env, ext, next_extent(ext)); + + if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, osc_cli(obj), obj); + } + osc_extent_put(env, ext); +} + +static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. 
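
osc_extent_merge() above only coalesces extents whose chunk ranges touch. A small standalone sketch of that adjacency test, assuming 4 KiB pages and 64 KiB chunks (example values, not taken from a live configuration):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch of the adjacency test in osc_extent_merge():
 * two extents may merge only if their *chunk* ranges touch.
 */
#define PAGE_SHIFT_EX   12      /* 4 KiB pages (assumption)   */
#define CHUNK_BITS_EX   16      /* 64 KiB chunks (assumption) */
#define PPC_BITS_EX     (CHUNK_BITS_EX - PAGE_SHIFT_EX) /* pages-per-chunk bits */

struct ext { unsigned long start, end; };       /* page indices, inclusive */

static bool chunk_adjacent(const struct ext *cur, const struct ext *victim)
{
        unsigned long cur_cs = cur->start >> PPC_BITS_EX;
        unsigned long cur_ce = cur->end >> PPC_BITS_EX;
        unsigned long vic_cs = victim->start >> PPC_BITS_EX;
        unsigned long vic_ce = victim->end >> PPC_BITS_EX;

        /* cur directly follows victim, or victim directly follows cur */
        return cur_cs == vic_ce + 1 || cur_ce + 1 == vic_cs;
}

int main(void)
{
        struct ext cur = { 16, 31 };    /* chunk 1 (pages 16..31)  */
        struct ext prev = { 0, 15 };    /* chunk 0                 */
        struct ext far = { 48, 63 };    /* chunk 3: gap at chunk 2 */

        printf("prev mergeable: %d\n", chunk_adjacent(&cur, &prev));    /* 1 */
        printf("far  mergeable: %d\n", chunk_adjacent(&cur, &far));     /* 0 */
        return 0;
}
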
+ */ +struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + int *grants) + +{ + struct client_obd *cli = osc_cli(obj); + struct cl_lock *lock; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + int max_pages; /* max_pages_per_rpc */ + int chunksize; + int ppc_bits; /* pages per chunk bits */ + int chunk_mask; + int rc; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + return ERR_PTR(-ENOMEM); + + lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0); + LASSERT(lock != NULL); + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + + LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT); + ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to rpc edge, rpc size may not be a power 2 integer. */ + max_pages = cli->cl_max_pages_per_rpc; + LASSERT((max_pages & ~chunk_mask) == 0); + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < lock->cll_descr.cld_start) + cur->oe_start = lock->cll_descr.cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_osclock = lock; + cur->oe_grants = 0; + cur->oe_mppr = max_pages; + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (ext == NULL) + ext = first_extent(obj); + while (ext != NULL) { + loff_t ext_chk_start = ext->oe_start >> ppc_bits; + loff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1) + break; + + /* if covering by different locks, no chance to match */ + if (lock != ext->oe_osclock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR, EXTPARA(cur)); + + ext = next_extent(ext); + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) { + ext = next_extent(ext); + continue; + } + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR, EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. */ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + ext = next_extent(ext); + continue; + } + + /* check if they belong to the same rpc slot before trying to + * merge. the extents are not overlapped and contiguous at + * chunk level to get here. 
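
The initialization a little earlier in osc_extent_find() sizes a brand-new extent to exactly the chunk containing @index and caps it at the end of its RPC slot. A worked example of that arithmetic with assumed shift values (16 pages per chunk, 256 pages per RPC):

#include <stdio.h>

int main(void)
{
        unsigned int ppc_bits = 4;              /* 16 pages per chunk (assumed) */
        unsigned long chunk_mask = ~((1UL << ppc_bits) - 1);
        unsigned long max_pages = 256;          /* max_pages_per_rpc (assumed)  */
        unsigned long index = 300;              /* page being cached            */

        unsigned long start = index & chunk_mask;
        unsigned long end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
        unsigned long max_end = index - (index % max_pages) + max_pages - 1;

        /* expect start=288, end=303 (one chunk), max_end=511 (end of RPC slot) */
        printf("start=%lu end=%lu max_end=%lu\n", start, end, max_end);
        return 0;
}
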
*/ + if (ext->oe_max_end != max_end) { + /* if they don't belong to the same RPC slot or + * max_pages_per_rpc has ever changed, do not merge. */ + ext = next_extent(ext); + continue; + } + + /* it's required that an extent must be contiguous at chunk + * level so that we know the whole extent is covered by grant + * (the pages in the extent are NOT required to be contiguous). + * Otherwise, it will be too much difficult to know which + * chunks have grants allocated. */ + + /* try to do front merge - extend ext's start */ + if (chunk + 1 == ext_chk_start) { + /* ext must be chunk size aligned */ + EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); + + /* pull ext's start back to cover cur */ + ext->oe_start = cur->oe_start; + ext->oe_grants += chunksize; + *grants -= chunksize; + + found = osc_extent_hold(ext); + } else if (chunk == ext_chk_end + 1) { + /* rear merge */ + ext->oe_end = cur->oe_end; + ext->oe_grants += chunksize; + *grants -= chunksize; + + /* try to merge with the next one because we just fill + * in a gap */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_extent_tax; + + found = osc_extent_hold(ext); + } + if (found != NULL) + break; + + ext = next_extent(ext); + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_osclock == cur->oe_osclock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + cur->oe_grants = chunksize + cli->cl_extent_tax; + *grants -= cur->oe_grants; + LASSERT(*grants >= 0); + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, lock->cll_descr.cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) { + found = ERR_PTR(rc); + goto out; + } + + goto restart; + } + +out: + osc_extent_put(env, cur); + LASSERT(*grants >= 0); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? 
: 4096; + __u64 last_off = 0; + int last_count = -1; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (blocksize < PAGE_CACHE_SIZE && + last_count != PAGE_CACHE_SIZE) { + /* For short writes we shouldn't count parts of pages that + * span a whole chunk on the OST side, or our accounting goes + * wrong. Should match the code in filter_grant_check. */ + int offset = oap->oap_page_off & ~CFS_PAGE_MASK; + int count = oap->oap_count + (offset & (blocksize - 1)); + int end = (offset + oap->oap_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_CACHE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + return 0; +} + +static int extent_wait_cb(struct osc_extent *ext, int state) +{ + int ret; + + osc_object_lock(ext->oe_obj); + ret = ext->oe_state == state; + osc_object_unlock(ext->oe_obj); + + return ret; +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state) +{ + struct osc_object *obj = ext->oe_obj; + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + int rc = 0; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp && + !ext->oe_trunc_pending) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); + if (rc == -ETIMEDOUT) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %d timedout, recovery in progress?\n", + osc_export(obj)->exp_obd->obd_name, state); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), + &lwi); + } + if (rc == 0 && ext->oe_rc < 0) + rc = ext->oe_rc; + return rc; +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + + LASSERT(sanity_check(ext) == 0); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + /* Request new lu_env. 
+ * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_nested_get(&nest); + io = &osc_env_info(env)->oti_io; + io->ci_obj = cl_object_top(osc2cl(obj)); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + goto out; + + /* discard all pages with index greater then trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + struct cl_page *sub = oap2cl_page(oap); + struct cl_page *page = cl_page_top(sub); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (sub->cp_index < trunc_index || + (sub->cp_index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. */ + if (sub->cp_index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_page_put(env, page); + + --ext->oe_nr_pages; + ++nr_pages; + } + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants); + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + return rc; +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flushing page - ll_writepage() has to be handled cautiously. + */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + int page_count = 0; + int rc; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. 
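
Returning to the grant arithmetic at the end of osc_extent_truncate() above: whole chunks beyond the truncation point give their grant back, and the chunk holding the truncation point is released too once no cached page remains in it. A worked example, again assuming 4 KiB pages and 64 KiB chunks:

#include <stdio.h>

int main(void)
{
        unsigned int ppc_bits = 4;              /* 16 pages/chunk (assumed)  */
        unsigned int chunkbits = 16;            /* 64 KiB chunk (assumed)    */
        unsigned long ext_end = 63;             /* extent covers chunks 0..3 */
        unsigned long trunc_index = 20;         /* truncate inside chunk 1   */
        unsigned long trunc_chunk = trunc_index >> ppc_bits;
        int pages_in_chunk = 0;                 /* nothing left in chunk 1   */

        long chunks = (long)(ext_end >> ppc_bits) - (long)trunc_chunk;
        if (pages_in_chunk == 0) {              /* chunk 1 is now empty too  */
                --trunc_chunk;
                ++chunks;
        }

        unsigned long grants = (unsigned long)chunks << chunkbits;
        unsigned long new_end = ((trunc_chunk + 1) << ppc_bits) - 1;

        /* expect 3 chunks freed (192 KiB of grant), extent now ends at page 15 */
        printf("chunks=%ld grants=%lu new_end=%lu\n", chunks, grants, new_end);
        return 0;
}
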
*/ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. */ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERT(last->oap_count > 0); + LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + return 0; +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. 
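
A compact sketch of that same-or-next-chunk rule as osc_extent_expand() below applies it; the chunk size is assumed and the return codes are simplified stand-ins for 0 / expanded / -ERANGE:

#include <stdio.h>

#define PPC_BITS_EX 4   /* 16 pages per chunk (assumption) */

/* returns: 0 already covered, 1 expanded by one chunk, -1 too far away */
static int try_expand(unsigned long *ext_end, unsigned long max_end,
                      unsigned long index)
{
        unsigned long chunk = index >> PPC_BITS_EX;
        unsigned long end_chunk = *ext_end >> PPC_BITS_EX;
        unsigned long new_end;

        if (chunk > end_chunk + 1)
                return -1;              /* not the same or next chunk */
        if (end_chunk >= chunk)
                return 0;               /* index already covered      */

        /* next chunk: grow the extent to the end of that chunk */
        new_end = ((chunk + 1) << PPC_BITS_EX) - 1;
        *ext_end = new_end < max_end ? new_end : max_end;
        return 1;
}

int main(void)
{
        unsigned long end = 31, max_end = 255;

        printf("%d\n", try_expand(&end, max_end, 20));  /* 0: same chunk        */
        printf("%d\n", try_expand(&end, max_end, 35));  /* 1: next chunk, grows */
        printf("end=%lu\n", end);                       /* now 47               */
        printf("%d\n", try_expand(&end, max_end, 100)); /* -1: chunk 6, too far */
        return 0;
}
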
+ */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) { + rc = -ERANGE; + goto out; + } + + if (end_chunk >= chunk) { + rc = 0; + goto out; + } + + LASSERT(end_chunk + 1 == chunk); + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) { + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + rc = -EAGAIN; + goto out; + } + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + *grants -= chunksize; + LASSERT(*grants >= 0); + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + +out: + osc_object_unlock(obj); + return rc; +} + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) 
\ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = cfs_time_current(); + return result; +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, page->cp_index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, page->cp_index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % PAGE_CACHE_SIZE; + else + return PAGE_CACHE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + enum cl_req_type crt; + int srvlock; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); + LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); + LASSERT(opg->ops_transfer_pinned); + + /* + * page->cp_req can be NULL if io submission failed before + * cl_req was allocated. + */ + if (page->cp_req != NULL) + cl_req_page_done(env, page); + LASSERT(page->cp_req == NULL); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + spin_lock(&obj->oo_seatbelt); + LASSERT(opg->ops_submitter != NULL); + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + spin_unlock(&obj->oo_seatbelt); + + opg->ops_submit_time = 0; + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + int bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + + return 0; +} + +#define OSC_DUMP_GRANT(cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d " \ + "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt, \ + __tmp->cl_import->imp_obd->obd_name, \ + __tmp->cl_dirty, __tmp->cl_dirty_max, \ + atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock.lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + atomic_inc(&obd_dirty_pages); + cli->cl_dirty += PAGE_CACHE_SIZE; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_CACHE_SIZE, pga, pga->pg); + osc_update_next_shrink(cli); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock.lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_dec(&obd_dirty_pages); + cli->cl_dirty -= PAGE_CACHE_SIZE; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit -= PAGE_CACHE_SIZE; + } +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + } else { + cli->cl_avail_grant += unused; + } +} + +void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was + * written. In this case OST may use less chunks to serve this partial + * write. OSTs don't actually know the page size on the client side. so + * clients have to calculate lost grant by the blocksize on the OST. + * See filter_grant_check() for details. 
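
A worked example of case 2 above, the short tail write against an OST whose block size is smaller than the client page size. Both sizes below are assumptions chosen for the illustration:

#include <stdio.h>

/*
 * The client reserved a full page of grant, but the OST only consumes
 * whole blocks for a short write, so the unused remainder is reported
 * back as lost grant.
 */
int main(void)
{
        unsigned int page_size = 4096;          /* client page size (assumed) */
        unsigned int blocksize = 1024;          /* OST block size (assumed)   */
        unsigned int page_off = 0;              /* write starts at page start */
        unsigned int count = 1500;              /* bytes actually written     */

        /* round the written byte range out to whole OST blocks */
        unsigned int used = count + (page_off & (blocksize - 1));
        unsigned int end = (page_off + count) & (blocksize - 1);
        if (end)
                used += blocksize - end;

        unsigned int lost = page_size - used;   /* grant to hand back */

        /* 1500 bytes span two 1 KiB blocks -> 2048 used, 2048 returned */
        printf("used=%u lost=%u\n", used, lost);
        return 0;
}
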
+ */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant) +{ + int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + client_obd_list_lock(&cli->cl_loi_list_lock); + atomic_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; + cli->cl_lost_grant += lost_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes, int transient) +{ + int rc; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && + atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit += PAGE_CACHE_SIZE; + atomic_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + rc = 1; + } else { + __osc_unreserve_grant(cli, bytes, bytes); + rc = 0; + } + return rc; +} + +static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +{ + int rc; + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&ocw->ocw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + struct osc_cache_waiter ocw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc = -EDQUOT; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max < PAGE_CACHE_SIZE || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + rc = -EDQUOT; + goto out; + } + + /* Hopefully normal case - cache space and write credits available */ + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + goto out; + } + + /* We can get here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. 
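
The reserve/unreserve pair above (osc_reserve_grant() and __osc_unreserve_grant()) keeps three counters in balance: grant is moved from available to reserved before the object lock is taken, and whatever was not consumed is returned afterwards. A minimal standalone sketch of that bookkeeping, with simplified field names and an assumed starting grant:

#include <stdio.h>

struct grant {
        long avail;     /* cf. cl_avail_grant    */
        long reserved;  /* cf. cl_reserved_grant */
        long lost;      /* cf. cl_lost_grant     */
};

static int reserve(struct grant *g, long bytes)
{
        if (g->avail < bytes)
                return -1;              /* -EDQUOT in the real code */
        g->avail -= bytes;
        g->reserved += bytes;
        return 0;
}

static void unreserve(struct grant *g, long reserved, long unused)
{
        g->reserved -= reserved;
        if (unused > reserved) {        /* e.g. a merge saved one extent tax */
                g->avail += reserved;
                g->lost += unused - reserved;
        } else {
                g->avail += unused;
        }
}

int main(void)
{
        struct grant g = { 1 << 20, 0, 0 };     /* 1 MiB available (assumed) */

        if (reserve(&g, 70000) == 0)            /* roughly chunk + extent tax */
                unreserve(&g, 70000, 4096);     /* only 4 KiB went unused     */
        printf("avail=%ld reserved=%ld lost=%ld\n", g.avail, g.reserved, g.lost);
        return 0;
}
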
+ * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition is no avail grants and no dirty pages caching, + * that really means there is no space on the OST. */ + init_waitqueue_head(&ocw.ocw_waitq); + ocw.ocw_oap = oap; + ocw.ocw_grant = bytes; + while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { + list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); + ocw.ocw_rc = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug_async(env, cli, NULL); + + CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", + cli->cl_import->imp_obd->obd_name, &ocw, oap); + + rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* l_wait_event is interrupted by signal */ + if (rc < 0) { + list_del_init(&ocw.ocw_entry); + goto out; + } + + LASSERT(list_empty(&ocw.ocw_entry)); + rc = ocw.ocw_rc; + + if (rc != -EDQUOT) + goto out; + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + goto out; + } + } +out: + client_obd_list_unlock(&cli->cl_loi_list_lock); + OSC_DUMP_GRANT(cli, "returned %d.\n", rc); + return rc; +} + +/* caller must hold loi_list_lock */ +void osc_wake_cache_waiters(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct osc_cache_waiter *ocw; + + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); + list_del_init(&ocw->ocw_entry); + + ocw->ocw_rc = -EDQUOT; + /* we can't dirty more */ + if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) || + (atomic_read(&obd_dirty_pages) + 1 > + obd_max_dirty_pages)) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n", + cli->cl_dirty, + cli->cl_dirty_max, obd_max_dirty_pages); + goto wakeup; + } + + ocw->ocw_rc = 0; + if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = -EDQUOT; + +wakeup: + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", + ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + return 0; + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + return 1; + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + return 1; + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + return 1; + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. 
as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. */ + if (!list_empty(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + return 1; + } + if (atomic_read(&osc->oo_nr_writes) >= + cli->cl_max_pages_per_rpc) + return 1; + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + return 0; + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + return 1; + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + return 1; + } + + return 0; +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int is_ready; + + client_obd_list_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propagate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. 
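
The policy just described can be condensed into a few lines; the sketch below mirrors it with simplified types, and the xid counter is a stand-in for illustration, not the ptlrpc API:

#include <stdio.h>

/*
 * Remember the first asynchronous error, force synchronous I/O, and only
 * relax once a write whose transfer id is past the failure point succeeds.
 */
struct async_rc {
        int rc;                         /* first recorded error        */
        int force_sync;                 /* future writes must be sync  */
        unsigned long long min_xid;     /* first xid that may clear it */
};

static unsigned long long next_xid = 100;       /* stand-in xid source */

static void process_ar(struct async_rc *ar, unsigned long long xid, int rc)
{
        if (rc) {
                if (!ar->rc)
                        ar->rc = rc;            /* keep the first failure */
                ar->force_sync = 1;
                ar->min_xid = ++next_xid;       /* sample the next xid    */
                return;
        }
        if (ar->force_sync && xid >= ar->min_xid)
                ar->force_sync = 0;             /* a newer write made it  */
}

int main(void)
{
        struct async_rc ar = { 0, 0, 0 };

        process_ar(&ar, 100, -5);       /* async write failed              */
        process_ar(&ar, 100, 0);        /* old xid succeeds: still forced  */
        printf("force_sync=%d rc=%d\n", ar.force_sync, ar.rc);  /* 1, -5   */
        process_ar(&ar, 101, 0);        /* xid past the failure: relax     */
        printf("force_sync=%d\n", ar.force_sync);               /* 0       */
        return 0;
}
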
*/ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + oap->oap_interrupted = 0; + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); +} + +/** + * Try to add extent to one RPC. We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, struct list_head *rpclist, + int *pc, unsigned int *max_pages) +{ + struct osc_extent *tmp; + struct osc_async_page *oap = list_first_entry(&ext->oe_pages, + struct osc_async_page, + oap_pending_item); + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + + *max_pages = max(ext->oe_mppr, *max_pages); + if (*pc + ext->oe_nr_pages > *max_pages) + return 0; + + list_for_each_entry(tmp, rpclist, oe_link) { + struct osc_async_page *oap2; + + oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, + oap_pending_item); + EASSERT(tmp->oe_owner == current, tmp); +#if 0 + if (overlapped(tmp, ext)) { + OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); + EASSERT(0, ext); + } +#endif + if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { + CDEBUG(D_CACHE, "Do not permit different type of IO" + " for a same RPC\n"); + return 0; + } + + if (tmp->oe_srvlock != ext->oe_srvlock || + !tmp->oe_grants != !ext->oe_grants) + return 0; + + /* remove break for strict check */ + break; + } + + *pc += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, rpclist); + ext->oe_owner = current; + return 1; +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. 
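
The budget that cuts those steps short is enforced by try_to_add_extent_for_io(); the condensed sketch below shows just the page counting, with the HP/urgent/tree ordering and the compatibility checks (same lock, same srvlock, same grant class) omitted and the sizes assumed:

#include <stdio.h>

int main(void)
{
        unsigned int extent_pages[] = { 64, 128, 32, 96 };      /* example sizes */
        unsigned int max_pages = 256;   /* max_pages_per_rpc (assumed)           */
        unsigned int page_count = 0;

        for (unsigned int i = 0; i < 4; i++) {
                if (page_count + extent_pages[i] > max_pages)
                        break;                          /* RPC is full */
                page_count += extent_pages[i];
                printf("added extent %u (%u pages), total %u\n",
                       i, extent_pages[i], page_count);
        }
        /* expect extents 0..2 (224 pages); extent 3 would overflow the RPC */
        printf("pages in this RPC: %u\n", page_count);
        return 0;
}
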
+ */ +static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + + LASSERT(osc_object_is_locked(obj)); + while (!list_empty(&obj->oo_hp_exts)) { + ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, + oe_link); + LASSERT(ext->oe_state == OES_CACHE); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + if (page_count == max_pages) + return page_count; + + while (!list_empty(&obj->oo_urgent_exts)) { + ext = list_entry(obj->oo_urgent_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + if (!ext->oe_intree) + continue; + + while ((ext = next_extent(ext)) != NULL) { + if ((ext->oe_state != OES_CACHE) || + (!list_empty(&ext->oe_link) && + ext->oe_owner != NULL)) + continue; + + if (!try_to_add_extent_for_io(cli, ext, rpclist, + &page_count, &max_pages)) + return page_count; + } + } + if (page_count == max_pages) + return page_count; + + ext = first_extent(obj); + while (ext != NULL) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { + ext = next_extent(ext); + continue; + } + + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + ext = next_extent(ext); + } + return page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + LIST_HEAD(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + u32 page_count = 0; + int srvlock = 0; + int rc = 0; + + LASSERT(osc_object_is_locked(osc)); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + return 0; + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + return rc; +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. 
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + struct osc_extent *ext; + struct osc_extent *next; + LIST_HEAD(rpclist); + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + int rc = 0; + + LASSERT(osc_object_is_locked(osc)); + list_for_each_entry_safe(ext, next, + &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, + &max_pages)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + LASSERT(page_count <= max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + return rc; +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item); + if (!list_empty(&cli->cl_loi_ready_list)) + return list_to_obj(&cli->cl_loi_ready_list, ready_item); + + /* then if we have cache waiters, return all objects with queued + * writes. This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshold */ + if (!list_empty(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + return list_to_obj(&cli->cl_loi_write_list, write_item); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + return list_to_obj(&cli->cl_loi_write_list, write_item); + if (!list_empty(&cli->cl_loi_read_list)) + return list_to_obj(&cli->cl_loi_read_list, read_item); + } + return NULL; +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli, + pdl_policy_t pol) +{ + struct osc_object *osc; + int rc = 0; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + if (osc_max_rpc_in_flight(cli, osc)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + client_obd_list_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", + current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. 
we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc, pol); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. */ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc, pol); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", + current); + cl_object_put(env, obj); + + client_obd_list_lock(&cli->cl_loi_list_lock); + } +} + +static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + /* disable osc_lru_shrink() temporarily to avoid + * potential stack overrun problem. LU-2859 */ + atomic_inc(&cli->cl_lru_shrinkers); + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli, pol); + client_obd_list_unlock(&cli->cl_loi_list_lock); + atomic_dec(&cli->cl_lru_shrinkers); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} + +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc) +{ + /* XXX: policy is no use actually. 
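+ * In this async path osc_io_unplug0() only queues cli->cl_writeback_work
+ * via ptlrpcd_queue_work() and the PDL_POLICY_ROUND value passed below is
+ * never consulted; the actual unplug happens later when that work item runs.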
*/ + return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1); +} + +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + (void)osc_io_unplug0(env, cli, osc, pol, 0); +} + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = page; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~CFS_PAGE_MASK)); + + if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", + oap, page, oap->oap_obj_off); + return 0; +} + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + pgoff_t index; + int grants = 0; + int brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + + if (oap->oap_magic != OAP_MAGIC) + return -EINVAL; + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + return -EIO; + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + return -EBUSY; + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; + if (!client_is_remote(osc_export(osc)) && + capable(CFS_CAP_SYS_RESOURCE)) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + /* check if the file's owner/group is over quota */ + if (!(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + rc = -EDQUOT; + if (rc) + return rc; + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from; + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = oap2cl_page(oap)->cp_index; + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accommodate this page; + * 2. otherwise, a new extent will be allocated. 
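+ *
+ * In either case the page has to be backed by grant: roughly one chunk
+ * (1 << cl_chunkbits bytes) plus cl_extent_tax is reserved when the page
+ * lies beyond the extent's current end, while a page already covered by
+ * the extent needs no extra grant (see the grants/tmp handling below).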
*/ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants, 0); + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + int tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant(cli, grants, tmp); + grants = 0; + } + } + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + if (grants == 0) { + /* we haven't allocated grant for this page. */ + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + } + return rc; +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_extent *ext = NULL; + int rc = 0; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, oap2cl_page(oap)->cp_index); + + osc_object_lock(obj); + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + oap2cl_page(oap)->cp_index); + rc = -EBUSY; + } + } + osc_object_unlock(obj); + if (ext != NULL) + osc_extent_put(env, ext); + return rc; +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. 
The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = cp->cp_index; + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), + "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. */ + rc = -EAGAIN; + goto out; + default: + break; + } + + rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); + if (rc) + goto out; + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (memory_pressure_get()) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +/** + * this is called when a sync waiter receives an interruption. Its job is to + * get the caller woken as soon as possible. If its page hasn't been put in an + * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as + * desiring interruption which will forcefully complete the rpc once the rpc + * has timed out. + */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_object *obj = oap->oap_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *found = NULL; + struct list_head *plist; + pgoff_t index = oap2cl_page(oap)->cp_index; + int rc = -EBUSY; + int cmd; + + LASSERT(!oap->oap_interrupted); + oap->oap_interrupted = 1; + + /* Find out the caching extent */ + osc_object_lock(obj); + if (oap->oap_cmd & OBD_BRW_WRITE) { + plist = &obj->oo_urgent_exts; + cmd = OBD_BRW_WRITE; + } else { + plist = &obj->oo_reading_exts; + cmd = OBD_BRW_READ; + } + list_for_each_entry(ext, plist, oe_link) { + if (ext->oe_start <= index && ext->oe_end >= index) { + LASSERT(ext->oe_state == OES_LOCK_DONE); + /* For OES_LOCK_DONE state extent, it has already held + * a refcount for RPC. 
*/ + found = osc_extent_get(ext); + break; + } + } + if (found != NULL) { + list_del_init(&found->oe_link); + osc_update_pending(obj, cmd, -found->oe_nr_pages); + osc_object_unlock(obj); + + osc_extent_finish(env, found, 0, -EINTR); + osc_extent_put(env, found); + rc = 0; + } else { + osc_object_unlock(obj); + /* ok, it's been put in an rpc. only one oap gets a request + * reference */ + if (oap->oap_request != NULL) { + ptlrpc_mark_interrupted(oap->oap_request); + ptlrpcd_wake(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + } + + osc_list_maint(cli, obj); + return rc; +} + +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap, *tmp; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + + list_for_each_entry(oap, list, oap_pending_item) { + struct cl_page *cp = oap2cl_page(oap); + if (cp->cp_index > end) + end = cp->cp_index; + if (cp->cp_index < start) + start = cp->cp_index; + ++page_count; + mppr <<= (page_count > mppr); + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + return -ENOMEM; + } + + ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (cmd & OBD_BRW_WRITE) { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + return 0; +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. + */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + LIST_HEAD(list); + int result = 0; + bool partial; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. 
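+ * (The wait itself happens with the object lock dropped: the extent is
+ * stashed in 'waiting', osc_extent_wait() is called on it further down,
+ * and the scan then restarts via 'goto again'.)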
*/ + LASSERT(!ext->oe_hp); + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. */ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + EASSERT(list_empty(&ext->oe_link), ext); + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while (!list_empty(&list)) { + int rc; + + ext = list_entry(list.next, struct osc_extent, oe_link); + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. */ + LASSERT(oio->oi_trunc == NULL); + oio->oi_trunc = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + return result; +} + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj) +{ + struct osc_extent *ext = oio->oi_trunc; + + oio->oi_trunc = NULL; + if (ext != NULL) { + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. 
+ * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. + */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + return result; +} + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + LIST_HEAD(discard_list); + bool unplug = false; + int result = 0; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + /* the only discarder is lock cancelling, so + * [start, end] must contain this extent */ + EASSERT(ext->oe_start >= start && + ext->oe_max_end <= end, ext); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). 
*/ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "cache page out.\n"); + return result; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h new file mode 100644 index 000000000..365b2787b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -0,0 +1,685 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +/* osc_build_res_name() */ +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "osc_internal.h" + +/** \defgroup osc osc + * @{ + */ + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. 
*/ + int oi_lockless; + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + + struct obd_info oi_info; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State of transfer for osc. + */ +struct osc_req { + struct cl_req_slice or_cl; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 64 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + ldlm_policy_data_t oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_page_list oti_plist; + struct cl_io oti_io; + struct cl_page *oti_pvec[OTI_PVEC_SIZE]; +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + unsigned long oo_contention_time; + /** + * List of pages in transfer. + */ + struct list_head oo_inflight[CRT_NR]; + /** + * Lock, protecting ccc_object::cob_inflight, because a seat-belt is + * locked during take-off and landing. + */ + spinlock_t oo_seatbelt; + + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_rpc_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. */ + spinlock_t oo_lock; +}; + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +static inline int osc_object_is_locked(struct osc_object *obj) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + return spin_is_locked(&obj->oo_lock); +#else + /* + * It is not perfect to return true all the time. + * But since this function is only used for assertion + * and checking, it seems OK. + */ + return 1; +#endif +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_RELEASED, + OLS_BLOCKED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode). + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). 
Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). + * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** underlying DLM lock */ + struct ldlm_lock *ols_lock; + /** lock value block */ + struct ost_lvb ols_lvb; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + + /** + * How many pages are using this lock for io, currently only used by + * read-ahead. If non-zero, the underlying dlm lock won't be cancelled + * during recovery to avoid deadlock. see bz16774. + * + * \see osc_page::ops_lock + * \see osc_page_addref_lock(), osc_page_putref_lock() + */ + atomic_t ols_pageref; + + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * set by osc_lock_use() to wait until blocking AST enters into + * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for + * further synchronization. + */ + ols_ast_wait:1, + /** + * If the data of this lock has been flushed to server side. + */ + ols_flush:1, + /** + * if set, the osc_lock is a glimpse lock. 
For glimpse locks, we treat + * the EVAVAIL error as tolerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is + * granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1; + /** + * IO that owns this lock. This field is used for a dead-lock + * avoidance by osc_lock_enqueue_wait(). + * + * XXX: unfortunately, the owner of a osc_lock is not unique, + * the lock may have multiple users, if the lock is granted and + * then matched. + */ + struct osc_io *ols_owner; +}; + + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. + */ + int ops_from; + /** + * An offset within page at which next transfer ends. + * + * \see osc_page::ops_from. + */ + int ops_to; + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + unsigned ops_transfer_pinned:1, + /** + * True for a `temporary page' created by read-ahead code, probably + * outside of any DLM lock. + */ + ops_temp:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1; + union { + /** + * lru page list. ops_inflight and ops_lru are exclusive so + * that they can share the same data. + */ + struct list_head ops_lru; + /** + * Linkage into a per-osc_object list of pages in flight. For + * debugging. + */ + struct list_head ops_inflight; + }; + /** + * Thread that submitted this page for transfer. For debugging. + */ + struct task_struct *ops_submitter; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + unsigned long ops_submit_time; + + /** + * A lock of which we hold a reference covers this page. Only used by + * read-ahead: for a readahead page, we hold it's covering lock to + * prevent it from being canceled during recovery. + * + * \see osc_lock::ols_pageref + * \see osc_page_addref_lock(), osc_page_putref_lock(). 
+ */ + struct cl_lock *ops_lock; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_req_kmem; +extern struct kmem_cache *osc_extent_kmem; + +extern struct lu_device_type osc_device_type; +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int osc_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int osc_req_init (const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + +void osc_index2policy (ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +int osc_lvb_print (const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); + +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags); +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol); + +void osc_object_set_contended (struct osc_object *obj); +void osc_object_clear_contended(struct osc_object *obj); +int osc_object_is_contended (struct osc_object *obj); + +int osc_lock_is_lockless (const struct osc_lock *olck); + +/***************************************************************************** + * + * Accessors. 
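+ *
+ * Roughly: cl2osc()/osc2cl() convert between the cl_object and osc_object
+ * layers, oap2osc()/oap2cl_page() map an osc_async_page back to its
+ * osc_page and cl_page, and osc_env_info()/osc_env_io() fetch per-thread
+ * and per-session state from the lu_env.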
+ * + */ + +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + else if (mode == CLM_WRITE) + return LCK_PW; + else + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + else if (mode == LCK_PW) + return CLM_WRITE; + else + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of0(oap, struct osc_page, ops_oap); +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. 
# of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + atomic_t oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + unsigned int oe_state; + /** flags for this extent. */ + unsigned int oe_intree:1, + /** 0 is write, 1 is read */ + oe_rw:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. */ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** Since an extent has to be written out in atomic, this is used to + * remember the next page need to be locked to write this extent out. + * Not used right now. + */ + struct osc_page *oe_next_page; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct cl_lock *oe_osclock; + /** terminator of this extent. Must be true if this extent is in IO. */ + struct task_struct *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. 
*/ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; +}; + +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); + +/** @} osc */ + +#endif /* OSC_CL_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c b/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c new file mode 100644 index 000000000..4935fc7c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, cl_req for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include "../include/obd_class.h" + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +struct kmem_cache *osc_object_kmem; +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_req_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof(struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof(struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof(struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof(struct osc_session) + }, + { + .ckd_cache = &osc_req_kmem, + .ckd_name = "osc_req_kmem", + .ckd_size = sizeof(struct osc_req) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof(struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = NULL + } +}; + +struct lock_class_key osc_ast_guard_class; + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + return osc_process_config_base(d->ld_obd, cfg); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations osc_cl_ops = { + .cdo_req_init = osc_req_init +}; + +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + return 0; +} + +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} + +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + return ERR_PTR(-ENOMEM); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + od->od_cl.cd_ops = &osc_cl_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + return ERR_PTR(rc); + } + od->od_exp = obd->obd_self_export; + return d; +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = 
&osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h b/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h new file mode 100644 index 000000000..af96c7bc7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -0,0 +1,203 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + u64 oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + struct ldlm_lock *oap_ldlm_lock; + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +struct osc_cache_waiter { + struct list_head ocw_entry; + wait_queue_head_t ocw_waitq; + struct osc_async_page *ocw_oap; + int ocw_grant; + int ocw_rc; +}; + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti); +void osc_wake_cache_waiters(struct client_obd *cli); +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_update_next_shrink(struct client_obd *cli); + +/* + * cl integration. 
+ */ +#include "../include/cl_object.h" + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl); +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode); + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref); + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t p); +int osc_lru_shrink(struct client_obd *cli, int target); + +extern spinlock_t osc_ast_guard; + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +#if defined (CONFIG_PROC_FS) +int lproc_osc_attach_seqstat(struct obd_device *dev); +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} +static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +extern struct lu_device_type osc_device_type; + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. 
*/ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm); + +extern struct kmem_cache *osc_quota_kmem; +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + u32 oqi_id; +}; +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u32 valid, u32 flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); + +#endif /* OSC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_io.c b/kernel/drivers/staging/lustre/lustre/osc/osc_io.c new file mode 100644 index 000000000..3c7300b06 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_io.c @@ -0,0 +1,819 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +static struct osc_req *cl2osc_req(const struct cl_req_slice *slice) +{ + LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type); + return container_of0(slice, struct osc_req, or_cl); +} + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +static struct osc_page *osc_cl_page_osc(struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + + return cl2osc_page(slice); +} + + +/***************************************************************************** + * + * io operations. + * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + LIST_HEAD(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + int queued = 0; + int result = 0; + int cmd; + int brw_flags; + int max_pages; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + + cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. */ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + cl_page_list_move(qout, qin, page); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + + osc_page_submit(env, opg, crt, brw_flags); + list_add_tail(&oap->oap_pending_item, &list); + if (++queued == max_pages) { + queued = 0; + result = osc_queue_sync_pages(env, osc, &list, cmd, + brw_flags); + if (result < 0) + break; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 
0 : result; +} + +static void osc_page_touch_at(const struct lu_env *env, + struct cl_object *obj, pgoff_t idx, unsigned to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + /* + * XXX old code used + * + * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); + * + * here + */ + CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", + kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + valid = 0; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); +} + +/** + * This is called when a page is accessed within file in a way that creates + * new page, if one were missing (i.e., if there were a hole at that place in + * the file, or accessed page is beyond the current file size). Examples: + * ->commit_write() and ->nopage() methods. + * + * Expand stripe KMS if necessary. + */ +static void osc_page_touch(const struct lu_env *env, + struct osc_page *opage, unsigned to) +{ + struct cl_page *page = opage->ops_cl.cpl_page; + struct cl_object *obj = opage->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, page->cp_index, to); +} + +/** + * Implements cl_io_operations::cio_prepare_write() method for osc layer. + * + * \retval -EIO transfer initiated against this osc will most likely fail + * \retval 0 transfer initiated against this osc will most likely succeed. + * + * The reason for this check is to immediately return an error to the caller + * in the case of a deactivated import. Note, that import can be deactivated + * later, while pages, dirtied by this IO, are still in the cache, but this is + * irrelevant, because that would still return an error to the application (if + * it does fsync), but many applications don't do fsync because of performance + * issues, and we wanted to return an -EIO at write time to notify the + * application. + */ +static int osc_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev); + struct obd_import *imp = class_exp2cliimp(dev->od_exp); + struct osc_io *oio = cl2osc_io(env, ios); + int result = 0; + + /* + * This implements OBD_BRW_CHECK logic from old client. + */ + + if (imp == NULL || imp->imp_invalid) + result = -EIO; + if (result == 0 && oio->oi_lockless) + /* this page contains `invalid' data, but who cares? + * nobody can access the invalid data. + * in osc_io_commit_write(), we're going to write exact + * [from, to) bytes of this page to OST. -jay */ + cl_page_export(env, slice->cpl_page, 1); + + return result; +} + +static int osc_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + struct osc_async_page *oap = &opg->ops_oap; + + LASSERT(to > 0); + /* + * XXX instead of calling osc_page_touch() here and in + * osc_io_fault_start() it might be more logical to introduce + * cl_page_touch() method, that generic cl_io_commit_write() and page + * fault code calls. 
+ */ + osc_page_touch(env, cl2osc_page(slice), to); + if (!client_is_remote(osc_export(obj)) && + capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + + if (oio->oi_lockless) + /* see osc_io_prepare_write() for lockless io handling. */ + cl_page_clip(env, slice->cpl_page, from, to); + + return 0; +} + +static int osc_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %d\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + return 0; +} + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. + */ +static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + const struct cl_page_slice *slice; + struct osc_page *ops; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + ops = cl2osc_page(slice); + oap = &ops->ops_oap; + + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", + start, current->comm); + + { + struct page *vmpage = cl_page_vmpage(env, page); + + if (PageLocked(vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, page->cp_index, + (oap->oap_cmd & OBD_BRW_RWMASK)); + } + + return CLP_GANG_OKAY; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. 
+ */ + cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_valid = io->u.ci_setattr.sa_valid; + int result = 0; + struct obd_info oinfo = { { { 0 } } }; + + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, oio, cl2osc(obj), size); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_valid & ATTR_SIZE) { + attr->cat_size = attr->cat_kms = size; + cl_valid = CAT_SIZE | CAT_KMS; + } + if (ia_valid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_valid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_valid & ATTR_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = cl_object_attr_set(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME; + if (ia_valid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + oinfo.oi_oa = oa; + oinfo.oi_capa = io->u.ci_setattr.sa_capa; + init_completion(&cbargs->opc_sync); + + if (ia_valid & ATTR_SIZE) + result = osc_punch_base(osc_export(cl2osc(obj)), + &oinfo, osc_async_upcall, + cbargs, PTLRPCD_SET); + else + result = osc_setattr_async_base(osc_export(cl2osc(obj)), + &oinfo, NULL, + osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + return result; +} + +static void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. 
*/ + osd->od_stats.os_lockless_truncates++; + } + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + + osc_trunc_check(env, io, oio, size); + if (oio->oi_trunc != NULL) { + osc_cache_truncate_end(env, oio, cl2osc(obj)); + oio->oi_trunc = NULL; + } + } +} + +static int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + + if (!slice->cis_io->ci_noatime) { + cl_object_attr_lock(obj); + attr->cat_atime = LTIME_S(CURRENT_TIME); + rc = cl_object_attr_set(env, obj, attr, CAT_ATIME); + cl_object_attr_unlock(obj); + } + return rc; +} + +static int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = LTIME_S(CURRENT_TIME); + rc = cl_object_attr_set(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + + return rc; +} + +static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct obd_info *oinfo = &oio->oi_info; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size abd blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + memset(oinfo, 0, sizeof(*oinfo)); + oinfo->oi_oa = oa; + oinfo->oi_capa = fio->fi_capa; + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs, + PTLRPCD_SET); + return rc; +} + +static int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem. 
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + return result; +} + +static void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +static void osc_io_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_FAULT] = { + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = osc_io_submit + }, + [CRT_WRITE] = { + .cio_submit = osc_io_submit + } + }, + .cio_prepare_write = osc_io_prepare_write, + .cio_commit_write = osc_io_commit_write +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +static int osc_req_prep(const struct lu_env *env, + const struct cl_req_slice *slice) +{ + return 0; +} + +static void osc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct osc_req *or; + + or = cl2osc_req(slice); + OBD_SLAB_FREE_PTR(or, osc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. 
+ */ +static void osc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct lov_oinfo *oinfo; + struct cl_req *clerq; + struct cl_page *apage; /* _some_ page in @clerq */ + struct cl_lock *lock; /* _some_ lock protecting @apage */ + struct osc_lock *olck; + struct osc_page *opg; + struct obdo *oa; + struct ost_lvb *lvb; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + if ((flags & OBD_MD_FLATIME) != 0) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + clerq = slice->crs_req; + LASSERT(!list_empty(&clerq->crq_pages)); + apage = container_of(clerq->crq_pages.next, + struct cl_page, cp_flight); + opg = osc_cl_page_osc(apage); + apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */ + lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1); + if (lock == NULL) { + struct cl_object_header *head; + struct cl_lock *scan; + + head = cl_object_header(apage->cp_obj); + list_for_each_entry(scan, &head->coh_locks, + cll_linkage) + CL_LOCK_DEBUG(D_ERROR, env, scan, + "no cover page!\n"); + CL_PAGE_DEBUG(D_ERROR, env, apage, + "dump uncover page!\n"); + dump_stack(); + LBUG(); + } + + olck = osc_lock_at(lock); + LASSERT(olck != NULL); + LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL)); + /* check for lockless io. */ + if (olck->ols_lock != NULL) { + oa->o_handle = olck->ols_lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + cl_lock_put(env, lock); + } +} + +static const struct cl_req_operations osc_req_ops = { + .cro_prep = osc_req_prep, + .cro_attr_set = osc_req_attr_set, + .cro_completion = osc_req_completion +}; + + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +int osc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct osc_req *or; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, GFP_NOFS); + if (or != NULL) { + cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c b/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c new file mode 100644 index 000000000..350ad4955 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c @@ -0,0 +1,1613 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "../../include/linux/libcfs/libcfs.h" +/* fid_build_reg_res_name() */ +#include "../include/lustre_fid.h" + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +#define _PAGEREF_MAGIC (-10000000) + +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force); +static int osc_lock_has_pages(struct osc_lock *olck); + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_lock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_lock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (!ergo(olock != NULL, handle_used)) + return 0; + + if (!ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (!ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_lock are consistent, but + * take into account that they are set at the different time. + */ + if (!ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (!ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + ((olock->l_flags & LDLM_FL_DESTROYED) == 0))) + return 0; + + if (!ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + olock->l_req_mode == olock->l_granted_mode && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. 
+ * + */ + +/** + * Breaks a link between osc_lock and dlm_lock. + */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + spin_lock(&osc_ast_guard); + dlmlock = olck->ols_lock; + if (dlmlock == NULL) { + spin_unlock(&osc_ast_guard); + return; + } + + olck->ols_lock = NULL; + /* wb(); --- for all who checks (ols->ols_lock != NULL) before + * call to osc_lock_detach() */ + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_object *obj = olck->ols_cl.cls_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid possible races. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_set(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + } + unlock_res_and_lock(dlmlock); + + /* release a reference taken in osc_lock_upcall0(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; +} + +static int osc_lock_unhold(struct osc_lock *ols) +{ + int result = 0; + + if (ols->ols_hold) { + ols->ols_hold = 0; + result = osc_cancel_base(&ols->ols_handle, + ols->ols_einfo.ei_mode); + } + return result; +} + +static int osc_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + + switch (ols->ols_state) { + case OLS_NEW: + LASSERT(!ols->ols_hold); + LASSERT(ols->ols_agl); + return 0; + case OLS_UPCALL_RECEIVED: + osc_lock_unhold(ols); + case OLS_ENQUEUED: + LASSERT(!ols->ols_hold); + osc_lock_detach(env, ols); + ols->ols_state = OLS_NEW; + return 0; + case OLS_GRANTED: + LASSERT(!ols->ols_glimpse); + LASSERT(ols->ols_hold); + /* + * Move lock into OLS_RELEASED state before calling + * osc_cancel_base() so that possible synchronous cancellation + * (that always happens e.g., for liblustre) sees that lock is + * released. + */ + ols->ols_state = OLS_RELEASED; + return osc_lock_unhold(ols); + default: + CERROR("Impossible state: %d\n", ols->ols_state); + LBUG(); + } +} + +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + /* + * ->ols_hold can still be true at this point if, for example, a + * thread that requested a lock was killed (and released a reference + * to the lock), before reply from a server was received. In this case + * lock is destroyed immediately after upcall. 
+ */ + osc_lock_unhold(ols); + LASSERT(ols->ols_lock == NULL); + LASSERT(atomic_read(&ols->ols_pageref) == 0 || + atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + ldlm_policy_data_t *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +static __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + return result; +} + +/** + * Global spin-lock protecting consistency of ldlm_lock::l_ast_data + * pointers. Initialized in osc_init(). + */ +spinlock_t osc_ast_guard; + +static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock) +{ + struct osc_lock *olck; + + lock_res_and_lock(dlm_lock); + spin_lock(&osc_ast_guard); + olck = dlm_lock->l_ast_data; + if (olck != NULL) { + struct cl_lock *lock = olck->ols_cl.cls_lock; + /* + * If osc_lock holds a reference on ldlm lock, return it even + * when cl_lock is in CLS_FREEING state. This way + * + * osc_ast_data_get(dlmlock) == NULL + * + * guarantees that all osc references on dlmlock were + * released. osc_dlm_blocking_ast0() relies on that. + */ + if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) { + cl_lock_get_trust(lock); + lu_ref_add_atomic(&lock->cll_reference, + "ast", current); + } else + olck = NULL; + } + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlm_lock); + return olck; +} + +static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck) +{ + struct cl_lock *lock; + + lock = olck->ols_cl.cls_lock; + lu_ref_del(&lock->cll_reference, "ast", current); + cl_lock_put(env, lock); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, + int rc) +{ + struct ost_lvb *lvb; + struct cl_object *obj; + struct lov_oinfo *oinfo; + struct cl_attr *attr; + unsigned valid; + + if (!(olck->ols_flags & LDLM_FL_LVB_READY)) + return; + + lvb = &olck->ols_lvb; + obj = olck->ols_cl.cls_obj; + oinfo = cl2osc(obj)->oo_oinfo; + attr = &osc_env_info(env)->oti_attr; + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (rc == 0) { + struct ldlm_lock *dlmlock; + __u64 size; + + dlmlock = olck->ols_lock; + LASSERT(dlmlock != NULL); + + /* re-grab LVB from a dlm lock under DLM spin-locks. */ + *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + size = lvb->lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu", + lvb->lvb_size, size); + valid |= CAT_KMS; + attr->cat_kms = size; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu", + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match_locked(dlmlock); + } else if (rc == -ENAVAIL && olck->ols_glimpse) { + CDEBUG(D_INODE, "glimpsed, setting rss=%llu; leaving kms=%llu\n", + lvb->lvb_size, oinfo->loi_kms); + } else + valid = 0; + + if (valid != 0) + cl_object_attr_set(env, obj, attr, valid); + + cl_object_attr_unlock(obj); +} + +/** + * Called when a lock is granted, from an upcall (when server returned a + * granted lock), or from completion AST, when server returned a blocked lock. + * + * Called under lock and resource spin-locks, that are released temporarily + * here. + */ +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, + struct ldlm_lock *dlmlock, int rc) +{ + struct ldlm_extent *ext; + struct cl_lock *lock; + struct cl_lock_descr *descr; + + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + + if (olck->ols_state < OLS_GRANTED) { + lock = olck->ols_cl.cls_lock; + ext = &dlmlock->l_policy_data.l_extent; + descr = &osc_env_info(env)->oti_descr; + descr->cld_obj = lock->cll_descr.cld_obj; + + /* XXX check that ->l_granted_mode is valid. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + /* + * tell upper layers the extent of the lock that was actually + * granted + */ + olck->ols_state = OLS_GRANTED; + osc_lock_lvb_update(env, olck, rc); + + /* release DLM spin-locks to allow cl_lock_{modify,signal}() + * to take a semaphore on a parent lock. This is safe, because + * spin-locks are needed to protect consistency of + * dlmlock->l_*_mode and LVB, and we have finished processing + * them. */ + unlock_res_and_lock(dlmlock); + cl_lock_modify(env, lock, descr); + cl_lock_signal(env, lock); + LINVRNT(osc_lock_invariant(olck)); + lock_res_and_lock(dlmlock); + } +} + +static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) + +{ + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(dlmlock->l_ast_data == olck); + LASSERT(olck->ols_lock == NULL); + olck->ols_lock = dlmlock; + spin_unlock(&osc_ast_guard); + + /* + * Lock might be not yet granted. In this case, completion ast + * (osc_ldlm_completion_ast()) comes later and finishes lock + * granting. + */ + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) + osc_lock_granted(env, olck, dlmlock, 0); + unlock_res_and_lock(dlmlock); + + /* + * osc_enqueue_interpret() decrefs asynchronous locks, counter + * this. + */ + ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_hold = 1; + + /* lock reference taken by ldlm_handle2lock_long() is owned by + * osc_lock and released in osc_lock_detach() */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", olck); + olck->ols_has_ref = 1; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. 
+ */ +static int osc_lock_upcall(void *cookie, int errcode) +{ + struct osc_lock *olck = cookie; + struct cl_lock_slice *slice = &olck->ols_cl; + struct cl_lock *lock = slice->cls_lock; + struct lu_env *env; + struct cl_env_nest nest; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + int rc; + + cl_lock_mutex_get(env, lock); + + LASSERT(lock->cll_state >= CLS_QUEUING); + if (olck->ols_state == OLS_ENQUEUED) { + olck->ols_state = OLS_UPCALL_RECEIVED; + rc = ldlm_error2errno(errcode); + } else if (olck->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", olck->ols_state); + LBUG(); + } + if (rc) { + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock(&olck->ols_handle); + if (dlmlock != NULL) { + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(olck->ols_lock == NULL); + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + ldlm_lock_fail_match_locked(dlmlock); + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + } + } else { + if (olck->ols_glimpse) + olck->ols_glimpse = 0; + osc_lock_upcall0(env, olck); + } + + /* Error handling, some errors are tolerable. */ + if (olck->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops == &osc_lock_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, olck, 1); + olck->ols_state = OLS_GRANTED; + rc = 0; + } else if (olck->ols_glimpse && rc == -ENAVAIL) { + osc_lock_lvb_update(env, olck, rc); + cl_lock_delete(env, lock); + /* Hide the error. */ + rc = 0; + } + + if (rc == 0) { + /* For AGL case, the RPC sponsor may exits the cl_lock + * processing without wait() called before related OSC + * lock upcall(). So update the lock status according + * to the enqueue result inside AGL upcall(). */ + if (olck->ols_agl) { + lock->cll_flags |= CLF_FROM_UPCALL; + cl_wait_try(env, lock); + lock->cll_flags &= ~CLF_FROM_UPCALL; + if (!olck->ols_glimpse) + olck->ols_agl = 0; + } + cl_lock_signal(env, lock); + /* del user for lock upcall cookie */ + cl_unuse_try(env, lock); + } else { + /* del user for lock upcall cookie */ + cl_lock_user_del(env, lock); + cl_lock_error(env, lock, rc); + } + + /* release cookie reference, acquired by osc_lock_enqueue() */ + cl_lock_hold_release(env, lock, "upcall", lock); + cl_lock_mutex_put(env, lock); + + lu_ref_del(&lock->cll_reference, "upcall", lock); + /* This maybe the last reference, so must be called after + * cl_lock_mutex_put(). */ + cl_lock_put(env, lock); + + cl_env_nested_put(&nest, env); + } else { + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LBUG(); + } + return errcode; +} + +/** + * Core of osc_dlm_blocking_ast() logic. + */ +static void osc_lock_blocking(const struct lu_env *env, + struct ldlm_lock *dlmlock, + struct osc_lock *olck, int blocking) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LASSERT(olck->ols_lock == dlmlock); + CLASSERT(OLS_BLOCKED < OLS_CANCELLED); + LASSERT(!osc_lock_is_lockless(olck)); + + /* + * Lock might be still addref-ed here, if e.g., blocking ast + * is sent for a failed lock. + */ + osc_lock_unhold(olck); + + if (blocking && olck->ols_state < OLS_BLOCKED) + /* + * Move osc_lock into OLS_BLOCKED before canceling the lock, + * because it recursively re-enters osc_lock_blocking(), with + * the state set to OLS_CANCELLED. 
+ */ + olck->ols_state = OLS_BLOCKED; + /* + * cancel and destroy lock at least once no matter how blocking ast is + * entered (see comment above osc_ldlm_blocking_ast() for use + * cases). cl_lock_cancel() and cl_lock_delete() are idempotent. + */ + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int cancel; + + LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING); + + cancel = 0; + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + LINVRNT(osc_lock_invariant(olck)); + if (olck->ols_ast_wait) { + /* wake up osc_lock_use() */ + cl_lock_signal(env, lock); + olck->ols_ast_wait = 0; + } + /* + * Lock might have been canceled while this thread was + * sleeping for lock mutex, but olck is pinned in memory. + */ + if (olck == dlmlock->l_ast_data) { + /* + * NOTE: DLM sends blocking AST's for failed locks + * (that are still in pre-OLS_GRANTED state) + * too, and they have to be canceled otherwise + * DLM lock is never destroyed and stuck in + * the memory. + * + * Alternatively, ldlm_cli_cancel() can be + * called here directly for osc_locks with + * ols_state < OLS_GRANTED to maintain an + * invariant that ->clo_cancel() is only called + * for locks that were granted. + */ + LASSERT(data == olck); + osc_lock_blocking(env, dlmlock, + olck, flag == LDLM_CB_BLOCKING); + } else + cancel = 1; + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + } else + /* + * DLM lock exists, but there is no cl_lock attached to it. + * This is a `normal' race. cl_object and its cl_lock's can be + * removed by memory pressure, together with all pages. + */ + cancel = (flag == LDLM_CB_BLOCKING); + + if (cancel) { + struct lustre_handle *lockh; + + lockh = &osc_env_info(env)->oti_handle; + ldlm_lock2handle(dlmlock, lockh); + result = ldlm_cli_cancel(lockh, LCF_ASYNC); + } else + result = 0; + return result; +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + struct lu_env *env; + struct cl_env_nest nest; + int result; + + /* + * This can be called in the context of outer IO, e.g., + * + * cl_enqueue()->... + * ->osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer context. + */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_nested_put(&nest, env); + } else { + result = PTR_ERR(env); + /* + * XXX This should never happen, as cl_lock is + * stuck. Pre-allocated environment a la vvp_inode_fini_env + * should be used. + */ + LBUG(); + } + if (result != 0) { + if (result == -ENODATA) + result = 0; + else + CERROR("BAST failed: %d\n", result); + } + return result; +} + +static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, + __u64 flags, void *data) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int dlmrc; + + /* first, do dlm part of the work */ + dlmrc = ldlm_completion_ast_async(dlmlock, flags, data); + /* then, notify cl_lock */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + /* + * ldlm_handle_cp_callback() copied LVB from request + * to lock->l_lvb_data, store it in osc_lock. + */ + LASSERT(dlmlock->l_lvb_data != NULL); + lock_res_and_lock(dlmlock); + olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + if (olck->ols_lock == NULL) { + /* + * upcall (osc_lock_upcall()) hasn't yet been + * called. Do nothing now, upcall will bind + * olck to dlmlock and signal the waiters. + * + * This maintains an invariant that osc_lock + * and ldlm_lock are always bound when + * osc_lock is in OLS_GRANTED state. + */ + } else if (dlmlock->l_granted_mode == + dlmlock->l_req_mode) { + osc_lock_granted(env, olck, dlmlock, dlmrc); + } + unlock_res_and_lock(dlmlock); + + if (dlmrc != 0) { + CL_LOCK_DEBUG(D_ERROR, env, lock, + "dlmlock returned %d\n", dlmrc); + cl_lock_error(env, lock, dlmrc); + } + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + result = 0; + } else + result = -ELDLM_NO_LOCK_DATA; + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + return dlmrc ?: result; +} + +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct osc_lock *olck; + struct cl_lock *lock; + struct cl_object *obj; + struct cl_env_nest nest; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + int result; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + /* osc_ast_data_get() has to go after environment is + * allocated, because osc_ast_data() acquires a + * reference to a lock, and it can only be released in + * environment. + */ + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. 
+ * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. + * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof(*lvb)); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + obj = lock->cll_descr.cld_obj; + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), + RCL_SERVER); + osc_ast_data_put(env, olck); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + req->rq_status = result; + return result; +} + +static unsigned long osc_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + /* + * don't need to grab coh_page_guard since we don't care the exact # + * of pages.. + */ + return cl_object_header(slice->cls_obj)->coh_pages; +} + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *clock, + struct osc_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + enum cl_lock_mode mode; + + mode = clock->cll_descr.cld_mode; + if (mode == CLM_PHANTOM) + /* + * For now, enqueue all glimpse locks in read mode. In the + * future, client might choose to enqueue LCK_PW lock for + * glimpse on a file opened for write. + */ + mode = CLM_READ; + + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = osc_ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicit requirement for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. 
+ */ +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } else { + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} + +static int osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + enum cl_lock_mode qing_mode; + enum cl_lock_mode qed_mode; + + qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode; + if (qed->ols_glimpse && + (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ)) + return 1; + + qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode; + return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ)); +} + +/** + * Cancel all conflicting locks and wait for them to be destroyed. + * + * This function is used for two purposes: + * + * - early cancel all conflicting locks before starting IO, and + * + * - guarantee that pages added to the page cache by lockless IO are never + * covered by locks other than lockless IO lock, and, hence, are not + * visible to other threads. + */ +static int osc_lock_enqueue_wait(const struct lu_env *env, + const struct osc_lock *olck) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + struct cl_lock_descr *descr = &lock->cll_descr; + struct cl_object_header *hdr = cl_object_header(descr->cld_obj); + struct cl_lock *scan; + struct cl_lock *conflict = NULL; + int lockless = osc_lock_is_lockless(olck); + int rc = 0; + + LASSERT(cl_lock_is_mutexed(lock)); + + /* make it enqueue anyway for glimpse lock, because we actually + * don't need to cancel any conflicting locks. */ + if (olck->ols_glimpse) + return 0; + + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + struct cl_lock_descr *cld = &scan->cll_descr; + const struct osc_lock *scan_ols; + + if (scan == lock) + break; + + if (scan->cll_state < CLS_QUEUING || + scan->cll_state == CLS_FREEING || + cld->cld_start > descr->cld_end || + cld->cld_end < descr->cld_start) + continue; + + /* overlapped and living locks. */ + + /* We're not supposed to give up group lock. 
*/ + if (scan->cll_descr.cld_mode == CLM_GROUP) { + LASSERT(descr->cld_mode != CLM_GROUP || + descr->cld_gid != scan->cll_descr.cld_gid); + continue; + } + + scan_ols = osc_lock_at(scan); + + /* We need to cancel the compatible locks if we're enqueuing + * a lockless lock, for example: + * imagine that client has PR lock on [0, 1000], and thread T0 + * is doing lockless IO in [500, 1500] region. Concurrent + * thread T1 can see lockless data in [500, 1000], which is + * wrong, because these data are possibly stale. */ + if (!lockless && osc_lock_compatible(olck, scan_ols)) + continue; + + cl_lock_get_trust(scan); + conflict = scan; + break; + } + spin_unlock(&hdr->coh_lock_guard); + + if (conflict) { + if (lock->cll_descr.cld_mode == CLM_GROUP) { + /* we want a group lock but a previous lock request + * conflicts, we do not wait but return 0 so the + * request is send to the server + */ + CDEBUG(D_DLMTRACE, "group lock %p is conflicted with %p, no wait, send to server\n", + lock, conflict); + cl_lock_put(env, conflict); + rc = 0; + } else { + CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, will wait\n", + lock, conflict); + LASSERT(lock->cll_conflict == NULL); + lu_ref_add(&conflict->cll_reference, "cancel-wait", + lock); + lock->cll_conflict = conflict; + rc = CLO_WAIT; + } + } + return rc; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = ols->ols_cl.cls_lock; + int result; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERTF(ols->ols_state == OLS_NEW, + "Impossible state: %d\n", ols->ols_state); + + LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, ols); + + result = osc_lock_enqueue_wait(env, ols); + if (result == 0) { + if (!osc_lock_is_lockless(ols)) { + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + ldlm_policy_data_t *policy = &info->oti_policy; + struct ldlm_enqueue_info *einfo = &ols->ols_einfo; + + /* lock will be passed as upcall cookie, + * hold ref to prevent to be released. */ + cl_lock_hold_add(env, lock, "upcall", lock); + /* a user for lock also */ + cl_lock_user_add(env, lock); + ols->ols_state = OLS_ENQUEUED; + + /* + * XXX: this is possible blocking point as + * ldlm_lock_match(LDLM_FL_LVB_READY) waits for + * LDLM_CP_CALLBACK. 
+ */ + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + result = osc_enqueue_base(osc_export(obj), resname, + &ols->ols_flags, policy, + &ols->ols_lvb, + obj->oo_oinfo->loi_kms_valid, + osc_lock_upcall, + ols, einfo, &ols->ols_handle, + PTLRPCD_SET, 1, ols->ols_agl); + if (result != 0) { + cl_lock_user_del(env, lock); + cl_lock_unhold(env, lock, "upcall", lock); + if (unlikely(result == -ECANCELED)) { + ols->ols_state = OLS_NEW; + result = 0; + } + } + } else { + ols->ols_state = OLS_GRANTED; + ols->ols_owner = osc_env_io(env); + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + return result; +} + +static int osc_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + + if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) { + if (olck->ols_flags & LDLM_FL_LVB_READY) { + return 0; + } else if (olck->ols_agl) { + if (lock->cll_flags & CLF_FROM_UPCALL) + /* It is from enqueue RPC reply upcall for + * updating state. Do not re-enqueue. */ + return -ENAVAIL; + else + olck->ols_state = OLS_NEW; + } else { + LASSERT(lock->cll_error); + return lock->cll_error; + } + } + + if (olck->ols_state == OLS_NEW) { + int rc; + + LASSERT(olck->ols_agl); + olck->ols_agl = 0; + olck->ols_flags &= ~LDLM_FL_BLOCK_NOWAIT; + rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST); + if (rc != 0) + return rc; + else + return CLO_REENQUEUED; + } + + LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED && + lock->cll_error == 0, olck->ols_lock != NULL)); + + return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT; +} + +/** + * An implementation of cl_lock_operations::clo_use() method that pins cached + * lock. + */ +static int osc_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + int rc; + + LASSERT(!olck->ols_hold); + + /* + * Atomically check for LDLM_FL_CBPENDING and addref a lock if this + * flag is not set. This protects us from a concurrent blocking ast. + */ + rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode); + if (rc == 0) { + olck->ols_hold = 1; + olck->ols_state = OLS_GRANTED; + } else { + struct cl_lock *lock; + + /* + * Lock is being cancelled somewhere within + * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already + * set, but osc_ldlm_blocking_ast() hasn't yet acquired + * cl_lock mutex. + */ + lock = slice->cls_lock; + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(lock->cll_users > 0); + /* set a flag for osc_dlm_blocking_ast0() to signal the + * lock.*/ + olck->ols_ast_wait = 1; + rc = CLO_WAIT; + } + return rc; +} + +static int osc_lock_flush(struct osc_lock *ols, int discard) +{ + struct cl_lock *lock = ols->ols_cl.cls_lock; + struct cl_env_nest nest; + struct lu_env *env; + int result = 0; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + struct osc_object *obj = cl2osc(ols->ols_cl.cls_obj); + struct cl_lock_descr *descr = &lock->cll_descr; + int rc = 0; + + if (descr->cld_mode >= CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, + descr->cld_start, descr->cld_end, + 1, discard); + LDLM_DEBUG(ols->ols_lock, + "lock %p: %d pages were %s.\n", lock, result, + discard ? 
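
osc_lock_use() above pins a cached lock only if ldlm_lock_addref_try() succeeds, i.e. only if no cancel is already pending on it, so it cannot race with a blocking AST that is about to tear the lock down. Below is a small pthread-based model of that "take a reference unless a cancel is pending" step; the names are hypothetical and a mutex stands in for the ldlm resource lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_lock {
    pthread_mutex_t guard;
    bool cancel_pending;    /* models LDLM_FL_CBPENDING */
    int refs;               /* models l_readers + l_writers */
};

/* Returns true and takes a reference only if no cancel is pending,
 * so a concurrent blocking AST cannot race with the new user. */
static bool lock_addref_try(struct demo_lock *lk)
{
    bool ok;

    pthread_mutex_lock(&lk->guard);
    ok = !lk->cancel_pending;
    if (ok)
        lk->refs++;
    pthread_mutex_unlock(&lk->guard);
    return ok;
}

int main(void)
{
    struct demo_lock lk = { PTHREAD_MUTEX_INITIALIZER, false, 0 };
    bool ok;

    ok = lock_addref_try(&lk);
    printf("first try: %s (refs=%d)\n", ok ? "pinned" : "wait", lk.refs);

    lk.cancel_pending = true;   /* a blocking AST marked the lock */
    ok = lock_addref_try(&lk);
    printf("after cancel pending: %s (refs=%d)\n",
           ok ? "pinned" : "wait", lk.refs);
    return 0;
}
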
"discarded" : "written"); + if (result > 0) + result = 0; + } + + rc = cl_lock_discard_pages(env, lock); + if (result == 0 && rc < 0) + result = rc; + + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + if (result == 0) { + ols->ols_flush = 1; + LINVRNT(!osc_lock_has_pages(ols)); + } + return result; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct osc_lock *olck = cl2osc_lock(slice); + struct ldlm_lock *dlmlock = olck->ols_lock; + int result = 0; + int discard; + + LASSERT(cl_lock_is_mutexed(lock)); + LINVRNT(osc_lock_invariant(olck)); + + if (dlmlock != NULL) { + int do_cancel; + + discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA); + if (olck->ols_state >= OLS_GRANTED) + result = osc_lock_flush(olck, discard); + osc_lock_unhold(olck); + + lock_res_and_lock(dlmlock); + /* Now that we're the only user of dlm read/write reference, + * mostly the ->l_readers + ->l_writers should be zero. + * However, there is a corner case. + * See bug 18829 for details.*/ + do_cancel = (dlmlock->l_readers == 0 && + dlmlock->l_writers == 0); + dlmlock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(dlmlock); + if (do_cancel) + result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC); + if (result < 0) + CL_LOCK_DEBUG(D_ERROR, env, lock, + "lock %p cancel failure with error(%d)\n", + lock, result); + } + olck->ols_state = OLS_CANCELLED; + olck->ols_flags &= ~LDLM_FL_LVB_READY; + osc_lock_detach(env, olck); +} + +static int osc_lock_has_pages(struct osc_lock *olck) +{ + return 0; +} + +static void osc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck; + + olck = cl2osc_lock(slice); + if (olck->ols_glimpse) { + LASSERT(!olck->ols_hold); + LASSERT(!olck->ols_lock); + return; + } + + LINVRNT(osc_lock_invariant(olck)); + LINVRNT(!osc_lock_has_pages(olck)); + + osc_lock_unhold(olck); + osc_lock_detach(env, olck); +} + +/** + * Implements cl_lock_operations::clo_state() method for osc layer. + * + * Maintains osc_lock::ols_owner field. + * + * This assumes that lock always enters CLS_HELD (from some other state) in + * the same IO context as one that requested the lock. This should not be a + * problem, because context is by definition shared by all activity pertaining + * to the same high-level IO. + */ +static void osc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX multiple io contexts can use the lock at the same time. 
+ */ + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(lock->ols_owner == NULL); + lock->ols_owner = oio; + } else if (state != CLS_HELD) + lock->ols_owner = NULL; +} + +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX print ldlm lock and einfo properly. + */ + (*p)(env, cookie, "%p %#16llx %#llx %d %p ", + lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} + +static int osc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + if (need->cld_enq_flags & CEF_NEVER) + return 0; + + if (ols->ols_state >= OLS_CANCELLED) + return 0; + + if (need->cld_mode == CLM_PHANTOM) { + if (ols->ols_agl) + return !(ols->ols_state > OLS_RELEASED); + + /* + * Note: the QUEUED lock can't be matched here, otherwise + * it might cause the deadlocks. + * In read_process, + * P1: enqueued read lock, create sublock1 + * P2: enqueued write lock, create sublock2(conflicted + * with sublock1). + * P1: Grant read lock. + * P1: enqueued glimpse lock(with holding sublock1_read), + * matched with sublock2, waiting sublock2 to be granted. + * But sublock2 can not be granted, because P1 + * will not release sublock1. Bang! + */ + if (ols->ols_state < OLS_GRANTED || + ols->ols_state > OLS_RELEASED) + return 0; + } else if (need->cld_enq_flags & CEF_MUST) { + /* + * If the lock hasn't ever enqueued, it can't be matched + * because enqueue process brings in many information + * which can be used to determine things such as lockless, + * CEF_MUST, etc. 
+ */ + if (ols->ols_state < OLS_UPCALL_RECEIVED && + ols->ols_locklessable) + return 0; + } + return 1; +} + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_wait, + .clo_unuse = osc_lock_unuse, + .clo_use = osc_lock_use, + .clo_delete = osc_lock_delete, + .clo_state = osc_lock_state, + .clo_cancel = osc_lock_cancel, + .clo_weigh = osc_lock_weigh, + .clo_print = osc_lock_print, + .clo_fits_into = osc_lock_fits_into, +}; + +static int osc_lock_lockless_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + + LASSERT(ols->ols_state == OLS_GRANTED); + LINVRNT(osc_lock_invariant(ols)); + + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + return 0; +} + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + int result; + + result = osc_lock_flush(ols, 0); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + ols->ols_state = OLS_CANCELLED; +} + +static int osc_lock_lockless_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED); + + return lock->cll_error; +} + +static void osc_lock_lockless_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio)); + lock->ols_owner = oio; + + /* set the io to be lockless if this lock is for io's + * host object */ + if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj)) + oio->oi_lockless = 1; + } +} + +static int osc_lock_lockless_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + if (!(need->cld_enq_flags & CEF_NEVER)) + return 0; + + /* lockless lock should only be used by its owning io. 
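
osc_lock_fits_into() above encodes when a cached osc lock may satisfy a new request: never for CEF_NEVER, only fully granted locks for glimpse (CLM_PHANTOM) requests to avoid the enqueue deadlock described in the comment, and a not-yet-enqueued locklessable lock cannot promise CEF_MUST. A condensed sketch of those rules as a predicate; the enum values and flag names are simplified stand-ins, and the state list omits intermediate states the driver has.

#include <stdbool.h>
#include <stdio.h>

enum state { NEW, ENQUEUED, UPCALL_RECEIVED, GRANTED, RELEASED, CANCELLED };

#define F_NEVER 0x1     /* caller never wants a DLM lock (like CEF_NEVER) */
#define F_MUST  0x2     /* caller insists on a DLM lock (like CEF_MUST) */

struct candidate {
    enum state state;
    bool agl;           /* asynchronous glimpse lock */
    bool locklessable;
};

static bool fits(const struct candidate *c, unsigned int need_flags,
                 bool need_phantom)
{
    if (need_flags & F_NEVER)
        return false;           /* lockless IO never reuses cached locks */
    if (c->state >= CANCELLED)
        return false;
    if (need_phantom) {
        if (c->agl)
            return c->state <= RELEASED;
        /* Only granted (not merely queued) locks match a glimpse,
         * otherwise the enqueue ordering can deadlock. */
        return c->state >= GRANTED && c->state <= RELEASED;
    }
    if ((need_flags & F_MUST) &&
        c->state < UPCALL_RECEIVED && c->locklessable)
        return false;           /* may still turn lockless later */
    return true;
}

int main(void)
{
    struct candidate c = { .state = ENQUEUED, .agl = false };

    printf("phantom match on a queued lock: %d\n", fits(&c, 0, true));
    return 0;
}
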
b22147 */ + return (lock->ols_owner == osc_env_io(env)); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_lockless_wait, + .clo_unuse = osc_lock_lockless_unuse, + .clo_state = osc_lock_lockless_state, + .clo_fits_into = osc_lock_lockless_fits_into, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct osc_lock *clk; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, GFP_NOFS); + if (clk != NULL) { + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo); + atomic_set(&clk->ols_pageref, 0); + clk->ols_state = OLS_NEW; + + clk->ols_flags = osc_enq2ldlm_flags(enqflags); + clk->ols_agl = !!(enqflags & CEF_AGL); + if (clk->ols_agl) + clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + if (clk->ols_flags & LDLM_FL_HAS_INTENT) + clk->ols_glimpse = 1; + + cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER)); + if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n", + lock, clk, clk->ols_flags); + + result = 0; + } else + result = -ENOMEM; + return result; +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm) +{ + struct osc_lock *olock; + int rc = 0; + + spin_lock(&osc_ast_guard); + olock = dlm->l_ast_data; + /* + * there's a very rare race with osc_page_addref_lock(), but that + * doesn't matter because in the worst case we don't cancel a lock + * which we actually can, that's no harm. + */ + if (olock != NULL && + atomic_add_return(_PAGEREF_MAGIC, + &olock->ols_pageref) != _PAGEREF_MAGIC) { + atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref); + rc = 1; + } + spin_unlock(&osc_ast_guard); + return rc; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_object.c b/kernel/drivers/staging/lustre/lustre/osc/osc_object.c new file mode 100644 index 000000000..92c202f70 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_object.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + int i; + + osc->oo_oinfo = cconf->u.coc_oinfo; + spin_lock_init(&osc->oo_seatbelt); + for (i = 0; i < CRT_NR; ++i) + INIT_LIST_HEAD(&osc->oo_inflight[i]); + + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_rpc_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} + +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + int i; + + for (i = 0; i < CRT_NR; ++i) + LASSERT(list_empty(&osc->oo_inflight[i])); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_rpc_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} + +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} + + +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; 
+ + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; + return 0; +} + +int osc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) { + CDEBUG(D_CACHE, "set kms from %llu to %llu\n", + oinfo->loi_kms, (__u64)attr->cat_kms); + loi_kms_set(oinfo, attr->cat_kms); + } + return 0; +} + +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + return 0; +} + + +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + unsigned long cur_time = cfs_time_current(); + unsigned long retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_set = osc_attr_set, + .coo_glimpse = osc_object_glimpse +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_page.c b/kernel/drivers/staging/lustre/lustre/osc/osc_page.c new file mode 100644 index 000000000..76ba58b09 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_page.c @@ -0,0 +1,916 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
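
osc_object_set_contended() and osc_object_is_contended() above implement a simple time-based back-off: an object is marked contended when contention is observed and stops being treated as contended once the device's contention interval has elapsed. A userspace sketch of the same timeout check using time(2); the field names below are stand-ins for oo_contended/oo_contention_time.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct demo_obj {
    bool contended;             /* models oo_contended */
    time_t contention_time;     /* models oo_contention_time */
};

/* Mark the object contended "now". */
static void obj_set_contended(struct demo_obj *o)
{
    o->contention_time = time(NULL);
    o->contended = true;
}

/* Contention expires contention_seconds after it was last observed,
 * mirroring the cfs_time_add()/cfs_time_after() check above. */
static bool obj_is_contended(struct demo_obj *o, int contention_seconds)
{
    if (!o->contended)
        return false;
    if (time(NULL) > o->contention_time + contention_seconds) {
        o->contended = false;   /* clear, like osc_object_clear_contended() */
        return false;
    }
    return true;
}

int main(void)
{
    struct demo_obj o = { false, 0 };

    obj_set_contended(&o);
    printf("contended right after marking: %d\n", obj_is_contended(&o, 2));
    return 0;
}
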
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del); +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Comment out osc_page_protected because it may sleep inside the + * the client_obd_list_lock. + * client_obd_list_lock -> osc_ap_completion -> osc_completion -> + * -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base + * -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep. + */ +#if 0 +static int osc_page_is_dlocked(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int pending, int unref) +{ + struct cl_page *page; + struct osc_object *obj; + struct osc_thread_info *info; + struct ldlm_res_id *resname; + struct lustre_handle *lockh; + ldlm_policy_data_t *policy; + ldlm_mode_t dlmmode; + __u64 flags; + + might_sleep(); + + info = osc_env_info(env); + resname = &info->oti_resname; + policy = &info->oti_policy; + lockh = &info->oti_handle; + page = opg->ops_cl.cpl_page; + obj = cl2osc(opg->ops_cl.cpl_obj); + + flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED; + if (pending) + flags |= LDLM_FL_CBPENDING; + + dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW; + osc_lock_build_res(env, obj, resname); + osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index); + return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + dlmmode, &flags, NULL, lockh, unref); +} + +/** + * Checks an invariant that a page in the cache is covered by a lock, as + * needed. + */ +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + struct cl_object_header *hdr; + struct cl_lock *scan; + struct cl_page *page; + struct cl_lock_descr *descr; + int result; + + LINVRNT(!opg->ops_temp); + + page = opg->ops_cl.cpl_page; + if (page->cp_owner != NULL && + cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER) + /* + * If IO is done without locks (liblustre, or lloop), lock is + * not required. + */ + result = 1; + else + /* otherwise check for a DLM lock */ + result = osc_page_is_dlocked(env, opg, mode, 1, unref); + if (result == 0) { + /* maybe this page is a part of a lockless io? 
*/ + hdr = cl_object_header(opg->ops_cl.cpl_obj); + descr = &osc_env_info(env)->oti_descr; + descr->cld_mode = mode; + descr->cld_start = page->cp_index; + descr->cld_end = page->cp_index; + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + /* + * Lock-less sub-lock has to be either in HELD state + * (when io is actively going on), or in CACHED state, + * when top-lock is being unlocked: + * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse(). + */ + if ((scan->cll_state == CLS_HELD || + scan->cll_state == CLS_CACHED) && + cl_lock_ext_match(&scan->cll_descr, descr)) { + struct osc_lock *olck; + + olck = osc_lock_at(scan); + result = osc_lock_is_lockless(olck); + break; + } + } + spin_unlock(&hdr->coh_lock_guard); + } + return result; +} +#else +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + return 1; +} +#endif + +/***************************************************************************** + * + * Page operations. + * + */ +static void osc_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + CDEBUG(D_TRACE, "%p\n", opg); + LASSERT(opg->ops_lock == NULL); +} + +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + if (opg->ops_transfer_pinned) { + lu_ref_del(&page->cp_reference, "transfer", page); + opg->ops_transfer_pinned = 0; + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). + */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + /* ops_lru and ops_inflight share the same field, so take it from LRU + * first and then use it as inflight. */ + osc_lru_del(osc_cli(obj), opg, false); + + spin_lock(&obj->oo_seatbelt); + list_add(&opg->ops_inflight, &obj->oo_inflight[crt]); + opg->ops_submitter = current; + spin_unlock(&obj->oo_seatbelt); +} + +static int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_page *opg = cl2osc_page(slice); + int result; + + LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0)); + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. 
*/ + if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) { + if (oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + } + + return result; +} + +void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof(*policy)); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static int osc_page_addref_lock(const struct lu_env *env, + struct osc_page *opg, + struct cl_lock *lock) +{ + struct osc_lock *olock; + int rc; + + LASSERT(opg->ops_lock == NULL); + + olock = osc_lock_at(lock); + if (atomic_inc_return(&olock->ols_pageref) <= 0) { + atomic_dec(&olock->ols_pageref); + rc = -ENODATA; + } else { + cl_lock_get(lock); + opg->ops_lock = lock; + rc = 0; + } + return rc; +} + +static void osc_page_putref_lock(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_lock *lock = opg->ops_lock; + struct osc_lock *olock; + + LASSERT(lock != NULL); + olock = osc_lock_at(lock); + + atomic_dec(&olock->ols_pageref); + opg->ops_lock = NULL; + + cl_lock_put(env, lock); +} + +static int osc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_lock *lock; + int result = -ENODATA; + + lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page, + NULL, 1, 0); + if (lock != NULL) { + if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0) + result = -EBUSY; + cl_lock_put(env, lock); + } + return result; +} + +static void osc_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + + if (unlikely(opg->ops_lock)) + osc_page_putref_lock(env, opg); +} + +static void osc_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + if (likely(opg->ops_lock)) + osc_page_putref_lock(env, opg); + osc_lru_add(osc_cli(obj), opg); +} + +static void osc_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(slice->cpl_obj); + + osc_lru_add(osc_cli(obj), opg); +} + +static int osc_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? 
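
osc_page_addref_lock()/osc_page_putref_lock() above keep a per-lock count of pages that still reference the lock, and a non-positive result from the increment means the reference can no longer be taken. The cancel side (osc_dlm_lock_pageref(), earlier in this file) biases the same counter with _PAGEREF_MAGIC to detect outstanding page references. Below is a minimal stdatomic model of that protocol; the sentinel value is an assumption for illustration, the real constant is defined in osc_cl_internal.h outside this hunk.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGEREF_BIAS (-10000000)    /* hypothetical large negative sentinel */

/* A page takes a reference only while the counter is "live"
 * (non-negative, i.e. the bias has not been applied yet). */
static bool page_addref(atomic_int *pageref)
{
    if (atomic_fetch_add(pageref, 1) + 1 <= 0) {
        atomic_fetch_sub(pageref, 1);   /* undo, lock is being torn down */
        return false;
    }
    return true;
}

static void page_putref(atomic_int *pageref)
{
    atomic_fetch_sub(pageref, 1);
}

/* Cancel side: add the bias; if the result is not exactly the bias,
 * some pages still hold references, so undo and report that. */
static bool lock_has_page_users(atomic_int *pageref)
{
    if (atomic_fetch_add(pageref, PAGEREF_BIAS) + PAGEREF_BIAS !=
        PAGEREF_BIAS) {
        atomic_fetch_sub(pageref, PAGEREF_BIAS);
        return true;
    }
    return false;
}

int main(void)
{
    atomic_int pageref = 0;

    page_addref(&pageref);
    printf("page users at cancel time: %d\n",
           lock_has_page_users(&pageref));
    page_putref(&pageref);
    return 0;
}
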
"-" : "+"; +} + +static inline unsigned long osc_submit_duration(struct osc_page *opg) +{ + if (opg->ops_submit_time == 0) + return 0; + + return (cfs_time_current() - opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %s %p %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n", + opg, + /* 1 */ + oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + osc_list(&opg->ops_inflight), + opg->ops_submitter, opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + osc_list(&cli->cl_cache_waiters), + osc_list(&cli->cl_loi_ready_list), + osc_list(&cli->cl_loi_hp_ready_list), + osc_list(&cli->cl_loi_write_list), + osc_list(&cli->cl_loi_read_list), + /* 5 */ + osc_list(&obj->oo_ready_item), + osc_list(&obj->oo_hp_ready_item), + osc_list(&obj->oo_write_item), + osc_list(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + osc_list(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + osc_list(&obj->oo_hp_exts), + osc_list(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1)); + + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page), + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + spin_lock(&obj->oo_seatbelt); + if (opg->ops_submitter != NULL) { + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + } + spin_unlock(&obj->oo_seatbelt); + + osc_lru_del(osc_cli(obj), opg, true); +} + +void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + opg->ops_from = from; + opg->ops_to = to; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. 
*/ + rc = osc_cancel_async_page(env, opg); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + return rc; +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + rc = osc_flush_async_page(env, io, opg); + return rc; +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_fini = osc_page_fini, + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_is_under_lock = osc_page_is_under_lock, + .cpo_disown = osc_page_disown, + .io = { + [CRT_READ] = { + .cpo_cache_add = osc_page_fail, + .cpo_completion = osc_page_completion_read + }, + [CRT_WRITE] = { + .cpo_cache_add = osc_page_cache_add, + .cpo_completion = osc_page_completion_write + } + }, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel, + .cpo_flush = osc_page_flush +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, page); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_CACHE_SIZE; + + result = osc_prep_async_page(osc, opg, vmpage, + cl_offset(obj, page->cp_index)); + if (result == 0) { + struct osc_io *oio = osc_env_io(env); + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(page, &opg->ops_cl, obj, + &osc_page_ops); + } + /* + * Cannot assert osc_page_protected() here as read-ahead + * creates temporary pages outside of a lock. + */ + /* ops_inflight and ops_lru are the same field, but it doesn't + * hurt to initialize it twice :-) */ + INIT_LIST_HEAD(&opg->ops_inflight); + INIT_LIST_HEAD(&opg->ops_lru); + + /* reserve an LRU space for this page */ + if (page->cp_type == CPT_CACHEABLE && result == 0) + result = osc_lru_reserve(env, osc, opg); + + return result; +} + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags) +{ + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = oap->oap_obj; + + LINVRNT(osc_page_protected(env, opg, + crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1)); + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n", + oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (!client_is_remote(osc_export(obj)) && + capable(CFS_CAP_SYS_RESOURCE)) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + opg->ops_submit_time = cfs_time_current(); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. 
On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. + */ + +static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); +static atomic_t osc_lru_waiters = ATOMIC_INIT(0); +/* LRU pages are freed in batch mode. OSC should at least free this + * number of pages to avoid running out of LRU budget, and.. */ +static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT); /* 2M */ +/* free this number at most otherwise it will take too long time to finish. */ +static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */ + +/* Check if we can free LRU slots from this OSC. If there exists LRU waiters, + * we should free slots aggressively. In this way, slots are freed in a steady + * step to maintain fairness among OSCs. + * + * Return how many LRU pages should be freed. */ +static int osc_cache_too_much(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int pages = atomic_read(&cli->cl_lru_in_list) >> 1; + + if (atomic_read(&osc_lru_waiters) > 0 && + atomic_read(cli->cl_lru_left) < lru_shrink_max) + /* drop lru pages aggressively */ + return min(pages, lru_shrink_max); + + /* if it's going to run out LRU slots, we should free some, but not + * too much to maintain fairness among OSCs. */ + if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { + unsigned long tmp; + + tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users); + if (pages > tmp) + return min(pages, lru_shrink_max); + + return pages > lru_shrink_min ? lru_shrink_min : 0; + } + + return 0; +} + +/* Return how many pages are not discarded in @pvec. */ +static int discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + int count; + int i; + + for (count = 0, i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + if (cl_page_own_try(env, io, page) == 0) { + /* free LRU page only if nobody is using it. + * This check is necessary to avoid freeing the pages + * having already been removed from LRU and pinned + * for IO. */ + if (!cl_page_in_use(page)) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + ++count; + } + cl_page_disown(env, io, page); + } + cl_page_put(env, page); + pvec[i] = NULL; + } + return max_index - count; +} + +/** + * Drop @target of pages from LRU at most. 
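
osc_cache_too_much() above decides how many LRU pages an OSC should give back: shrink aggressively (up to lru_shrink_max) when other threads are waiting for slots, shrink toward a fair share when the global budget runs low, and otherwise free nothing or a small lru_shrink_min batch. A pure-arithmetic sketch of that policy follows; the constants assume 4 KiB pages, matching the 2M/32M comments above.

#include <stdio.h>

#define LRU_SHRINK_MIN (2 << (20 - 12))     /* 2 MiB worth of 4 KiB pages */
#define LRU_SHRINK_MAX (32 << (20 - 12))    /* 32 MiB worth of 4 KiB pages */

static int min_int(int a, int b) { return a < b ? a : b; }

/* Returns how many LRU pages this client should free, mirroring the
 * three branches of osc_cache_too_much(). */
static int cache_too_much(int lru_in_list, int lru_left, int lru_max,
                          int users, int waiters)
{
    int pages = lru_in_list >> 1;   /* consider at most half our pages */

    if (waiters > 0 && lru_left < LRU_SHRINK_MAX)
        return min_int(pages, LRU_SHRINK_MAX); /* others are starving */

    if (lru_left < lru_max >> 4) {  /* global budget below 1/16th */
        int fair_share = lru_max / users;

        if (pages > fair_share)
            return min_int(pages, LRU_SHRINK_MAX);
        return pages > LRU_SHRINK_MIN ? LRU_SHRINK_MIN : 0;
    }
    return 0;
}

int main(void)
{
    /* 100k cached pages, 1k slots left out of 1M, 8 OSCs, no waiters. */
    printf("pages to free: %d\n",
           cache_too_much(100000, 1000, 1 << 20, 8, 0));
    return 0;
}
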
+ */ +int osc_lru_shrink(struct client_obd *cli, int target) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + int maxscan = 0; + int count = 0; + int index = 0; + int rc = 0; + + LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0); + if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0) + return 0; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + pvec = osc_env_info(env)->oti_pvec; + io = &osc_env_info(env)->oti_io; + + client_obd_list_lock(&cli->cl_lru_list_lock); + atomic_inc(&cli->cl_lru_shrinkers); + maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + + if (--maxscan < 0) + break; + + opg = list_entry(cli->cl_lru_list.next, struct osc_page, + ops_lru); + page = cl_page_top(opg->ops_cl.cpl_page); + if (cl_page_in_use_noref(page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + client_obd_list_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + /* move this page to the end of list as it will be discarded + * soon. The page will be finally removed from LRU list in + * osc_page_delete(). */ + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + + /* it's okay to grab a refcount here w/o holding lock because + * it has to grab cl_lru_list_lock to delete the page. */ + cl_page_get(page); + pvec[index++] = page; + if (++count >= target) + break; + + if (unlikely(index == OTI_PVEC_SIZE)) { + client_obd_list_unlock(&cli->cl_lru_list_lock); + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + client_obd_list_lock(&cli->cl_lru_list_lock); + } + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + cl_env_nested_put(&nest, env); + + atomic_dec(&cli->cl_lru_shrinkers); + return count > 0 ? count : rc; +} + +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg) +{ + bool wakeup = false; + + if (!opg->ops_in_lru) + return; + + atomic_dec(&cli->cl_lru_busy); + client_obd_list_lock(&cli->cl_lru_list_lock); + if (list_empty(&opg->ops_lru)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + atomic_inc_return(&cli->cl_lru_in_list); + wakeup = atomic_read(&osc_lru_waiters) > 0; + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (wakeup) { + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up_all(&osc_lru_waitq); + } +} + +/* delete page from LRUlist. The page can be deleted from LRUlist for two + * reasons: redirtied or deleted from page cache. 
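
osc_lru_shrink() above collects victim pages into a fixed-size vector (oti_pvec) while holding the list lock and flushes the batch with discard_pagevec() whenever the vector fills or the object changes, so the lock is never held across the expensive discard work. A generic sketch of that "collect under the lock, process outside it" batching pattern; the types and sizes here are illustrative only.

#include <pthread.h>
#include <stdio.h>

#define PVEC_SIZE 16    /* stands in for OTI_PVEC_SIZE */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Expensive per-item work done outside the lock (discarding pages in
 * the driver); here it just clears and counts the batch. */
static int process_batch(int *pvec, int count)
{
    int i, done = 0;

    for (i = 0; i < count; i++, done++)
        pvec[i] = 0;
    return done;
}

static int shrink(const int *items, int nr_items, int target)
{
    int pvec[PVEC_SIZE];
    int index = 0, freed = 0, i;

    pthread_mutex_lock(&list_lock);
    for (i = 0; i < nr_items && freed + index < target; i++) {
        pvec[index++] = items[i];       /* cheap: grab a reference */
        if (index == PVEC_SIZE) {
            pthread_mutex_unlock(&list_lock);
            freed += process_batch(pvec, index);    /* slow, lock dropped */
            index = 0;
            pthread_mutex_lock(&list_lock);
        }
    }
    pthread_mutex_unlock(&list_lock);
    freed += process_batch(pvec, index);            /* final partial batch */
    return freed;
}

int main(void)
{
    int items[100] = { 0 };

    printf("freed %d of target 40\n", shrink(items, 100, 40));
    return 0;
}
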
*/ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del) +{ + if (opg->ops_in_lru) { + client_obd_list_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + LASSERT(atomic_read(&cli->cl_lru_in_list) > 0); + list_del_init(&opg->ops_lru); + atomic_dec(&cli->cl_lru_in_list); + if (!del) + atomic_inc(&cli->cl_lru_busy); + } else if (del) { + LASSERT(atomic_read(&cli->cl_lru_busy) > 0); + atomic_dec(&cli->cl_lru_busy); + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + if (del) { + atomic_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. + * cl_lru_shrinkers is to avoid recursive call in case + * we're already in the context of osc_lru_shrink(). */ + if (atomic_read(&cli->cl_lru_shrinkers) == 0 && + !memory_pressure_get()) + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up(&osc_lru_waitq); + } + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +static inline int max_to_shrink(struct client_obd *cli) +{ + return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max); +} + +static int osc_lru_reclaim(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int max_scans; + int rc; + + LASSERT(cache != NULL); + LASSERT(!list_empty(&cache->ccc_lru)); + + rc = osc_lru_shrink(cli, lru_shrink_min); + if (rc != 0) { + CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n", + cli->cl_import->imp_obd->obd_name, rc, cli); + return rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users); + while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { + cli = list_entry(cache->ccc_lru.next, struct client_obd, + cl_lru_osc); + + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + if (atomic_read(&cli->cl_lru_in_list) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(cli, max_to_shrink(cli)); + spin_lock(&cache->ccc_lru_lock); + if (rc != 0) + break; + } + } + spin_unlock(&cache->ccc_lru_lock); + + CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n", + cli->cl_import->imp_obd->obd_name, cli, rc); + return rc; +} + +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct client_obd *cli = osc_cli(obj); + int rc = 0; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + return 0; + + LASSERT(atomic_read(cli->cl_lru_left) >= 0); + while (!atomic_add_unless(cli->cl_lru_left, -1, 0)) { + int gen; + + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli); + if (rc < 0) + break; + if (rc > 0) + continue; + + cond_resched(); + + /* slowest case, all of caching pages are busy, notifying + * other OSCs that we're lack of LRU slots. 
*/ + atomic_inc(&osc_lru_waiters); + + gen = atomic_read(&cli->cl_lru_in_list); + rc = l_wait_event(osc_lru_waitq, + atomic_read(cli->cl_lru_left) > 0 || + (atomic_read(&cli->cl_lru_in_list) > 0 && + gen != atomic_read(&cli->cl_lru_in_list)), + &lwi); + + atomic_dec(&osc_lru_waiters); + if (rc < 0) + break; + } + + if (rc >= 0) { + atomic_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + return rc; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c b/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c new file mode 100644 index 000000000..6690f149a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c @@ -0,0 +1,327 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Code originally extracted from quota directory + */ + +#include "../include/obd_class.h" +#include "osc_internal.h" + +static inline struct osc_quota_info *osc_oqi_alloc(u32 id) +{ + struct osc_quota_info *oqi; + + OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem); + if (oqi != NULL) + oqi->oqi_id = id; + + return oqi; +} + +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if (oqi) { + /* do not try to access oqi here, it could have been + * freed by osc_quota_setdq() */ + + /* the slot is busy, the user is about to run out of + * quota space on this OST */ + CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", + type == USRQUOTA ? "user" : "grout", qid[type]); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \ + : OBD_MD_FLGRPQUOTA) +#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? 
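
osc_lru_reserve(), shown above, takes one slot from the shared cl_lru_left budget with an "add unless zero" operation and, when the budget is exhausted, alternates between reclaiming slots itself and sleeping until a slot frees up or the LRU generation changes. A compact stdatomic model of that reservation step; the reclaim and wait parts are reduced to stubs and all names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int lru_left = 4;     /* shared budget of free LRU slots */

/* Take one slot only if the budget is still positive, mirroring
 * atomic_add_unless(cli->cl_lru_left, -1, 0). */
static bool take_slot(void)
{
    int cur = atomic_load(&lru_left);

    while (cur > 0) {
        if (atomic_compare_exchange_weak(&lru_left, &cur, cur - 1))
            return true;
    }
    return false;
}

/* Stand-in for osc_lru_reclaim(): pretend we freed one slot. */
static int reclaim(void)
{
    atomic_fetch_add(&lru_left, 1);
    return 1;
}

static int reserve(void)
{
    while (!take_slot()) {
        if (reclaim() > 0)
            continue;   /* freed something, retry immediately */
        /* In the driver this is where l_wait_event() sleeps until
         * lru_left > 0 or the LRU generation changes. */
        return -1;
    }
    return 0;
}

int main(void)
{
    int i;

    for (i = 0; i < 6; i++)
        printf("reserve %d -> %d (left %d)\n", i, reserve(),
               atomic_load(&lru_left));
    return 0;
}
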
OBD_FL_NO_USRQUOTA \ + : OBD_FL_NO_GRPQUOTA) + +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u32 valid, u32 flags) +{ + int type; + int rc = 0; + + if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0) + return 0; + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + if ((valid & MD_QUOTA_FLAG(type)) == 0) + continue; + + /* lookup the ID in the per-type hash table */ + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if ((flags & FL_QUOTA_FLAG(type)) != 0) { + /* This ID is getting close to its quota limit, let's + * switch to sync I/O */ + if (oqi != NULL) + continue; + + oqi = osc_oqi_alloc(qid[type]); + if (oqi == NULL) { + rc = -ENOMEM; + break; + } + + rc = cfs_hash_add_unique(cli->cl_quota_hash[type], + &qid[type], &oqi->oqi_hash); + /* race with others? */ + if (rc == -EALREADY) { + rc = 0; + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + } + + CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], rc); + } else { + /* This ID is now off the hook, let's remove it from + * the hash table */ + if (oqi == NULL) + continue; + + oqi = cfs_hash_del_key(cli->cl_quota_hash[type], + &qid[type]); + if (oqi) + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + + CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], oqi); + } + } + + return rc; +} + +/* + * Hash operations for uid/gid <-> osc_quota_info + */ +static unsigned +oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u32_hash(*((__u32 *)key), mask); +} + +static int +oqi_keycmp(const void *key, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + u32 uid; + + LASSERT(key != NULL); + uid = *((u32 *)key); + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + return uid == oqi->oqi_id; +} + +static void * +oqi_key(struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + return &oqi->oqi_id; +} + +static void * +oqi_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct osc_quota_info, oqi_hash); +} + +static void +oqi_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); +} + +#define HASH_QUOTA_BKT_BITS 5 +#define HASH_QUOTA_CUR_BITS 5 +#define HASH_QUOTA_MAX_BITS 15 + +static cfs_hash_ops_t quota_hash_ops = { + .hs_hash = oqi_hashfn, + .hs_keycmp = oqi_keycmp, + .hs_key = oqi_key, + .hs_object = oqi_object, + .hs_get = oqi_get, + .hs_put_locked = oqi_put_locked, + .hs_exit = oqi_exit, +}; + +int osc_quota_setup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int i, type; + + for (type = 0; type < MAXQUOTAS; type++) { + cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", + HASH_QUOTA_CUR_BITS, + HASH_QUOTA_MAX_BITS, + HASH_QUOTA_BKT_BITS, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + "a_hash_ops, + CFS_HASH_DEFAULT); + if (cli->cl_quota_hash[type] == NULL) + break; + } + + if (type == MAXQUOTAS) + return 0; + + for (i = 0; i < type; i++) + cfs_hash_putref(cli->cl_quota_hash[i]); + + return -ENOMEM; +} + +int osc_quota_cleanup(struct obd_device 
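
osc_quota_chkdq()/osc_quota_setdq() above maintain, per quota type (user and group), a hash of IDs that are close to their limit on some OST: chkdq refuses cached writes for those IDs so the client falls back to sync I/O, and setdq inserts or removes IDs as the server sets or clears the no-quota flags. A minimal sketch of the same bookkeeping, with a plain fixed-size array standing in for cfs_hash.

#include <stdbool.h>
#include <stdio.h>

enum { Q_USR, Q_GRP, Q_TYPES };
#define SET_SIZE 64

/* One "near quota limit" ID set per quota type; 0 marks a free slot,
 * so real IDs are assumed non-zero in this toy model. */
static unsigned int noquota[Q_TYPES][SET_SIZE];

static bool set_contains(int type, unsigned int id)
{
    int i;

    for (i = 0; i < SET_SIZE; i++)
        if (noquota[type][i] == id)
            return true;
    return false;
}

/* chkdq: any flagged ID means "no quota", so use sync I/O. */
static bool chkdq(const unsigned int qid[Q_TYPES])
{
    int type;

    for (type = 0; type < Q_TYPES; type++)
        if (set_contains(type, qid[type]))
            return false;   /* NO_QUOTA */
    return true;            /* QUOTA_OK */
}

/* setdq: flag or clear an ID as the server reports it near/below limit. */
static void setdq(int type, unsigned int id, bool near_limit)
{
    int i;

    for (i = 0; i < SET_SIZE; i++) {
        if (near_limit && noquota[type][i] == 0) {
            if (!set_contains(type, id))
                noquota[type][i] = id;
            return;
        }
        if (!near_limit && noquota[type][i] == id) {
            noquota[type][i] = 0;
            return;
        }
    }
}

int main(void)
{
    unsigned int qid[Q_TYPES] = { 1000, 1000 };

    setdq(Q_USR, 1000, true);
    printf("cached write allowed: %d\n", chkdq(qid));
    setdq(Q_USR, 1000, false);
    printf("cached write allowed: %d\n", chkdq(qid));
    return 0;
}
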
*obd) +{ + struct client_obd *cli = &obd->u.cli; + int type; + + for (type = 0; type < MAXQUOTAS; type++) + cfs_hash_putref(cli->cl_quota_hash[type]); + + return 0; +} + +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, + OST_QUOTACTL); + if (req == NULL) + return -ENOMEM; + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg) { + oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqc) { + *oqctl = *oqc; + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + return rc; +} + +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION, + OST_QUOTACHECK); + if (req == NULL) + return -ENOMEM; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + return rc; +} + +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_request.c b/kernel/drivers/staging/lustre/lustre/osc/osc_request.c new file mode 100644 index 000000000..d7a9b650d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_request.c @@ -0,0 +1,3379 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "../../include/linux/libcfs/libcfs.h" + + +#include "../include/lustre_dlm.h" +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_user.h" +#include "../include/obd_cksum.h" + +#include "../include/lustre_ha.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_debug.h" +#include "../include/lustre_param.h" +#include "../include/lustre_fid.h" +#include "../include/obd_class.h" +#include "osc_internal.h" +#include "osc_cl_internal.h" + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; +}; + +struct osc_async_args { + struct obd_info *aa_oi; +}; + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct obd_info *fa_oi; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + __u64 *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; + unsigned int oa_agl:1; +}; + +static void osc_release_ppga(struct brw_page **ppga, u32 count); +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc); +int osc_cleanup(struct obd_device *obd); + +/* Pack OSC object metadata for disk storage (LE byte order). */ +static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + int lmm_size; + + lmm_size = sizeof(**lmmp); + if (lmmp == NULL) + return lmm_size; + + if (*lmmp != NULL && lsm == NULL) { + OBD_FREE(*lmmp, lmm_size); + *lmmp = NULL; + return 0; + } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) { + return -EBADF; + } + + if (*lmmp == NULL) { + OBD_ALLOC(*lmmp, lmm_size); + if (*lmmp == NULL) + return -ENOMEM; + } + + if (lsm) + ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi); + + return lmm_size; +} + +/* Unpack OSC object metadata from disk storage (LE byte order). */ +static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + int lsm_size; + struct obd_import *imp = class_exp2cliimp(exp); + + if (lmm != NULL) { + if (lmm_bytes < sizeof(*lmm)) { + CERROR("%s: lov_mds_md too small: %d, need %d\n", + exp->exp_obd->obd_name, lmm_bytes, + (int)sizeof(*lmm)); + return -EINVAL; + } + /* XXX LOV_MAGIC etc check? 
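
osc_packmd()/osc_unpackmd() above convert the object id between CPU and little-endian byte order (ostid_cpu_to_le()/ostid_le_to_cpu()) so the same lov_mds_md bytes mean the same thing on disk and on the wire regardless of host endianness. A standalone sketch of that conversion with explicit byte shuffling; the struct below is a stand-in, not the real lov_mds_md layout.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 64-bit id, stored little-endian on the wire. */
struct wire_id {
    uint8_t bytes[8];
};

static void id_cpu_to_le(uint64_t id, struct wire_id *w)
{
    int i;

    for (i = 0; i < 8; i++)
        w->bytes[i] = (uint8_t)(id >> (8 * i));   /* LSB first */
}

static uint64_t id_le_to_cpu(const struct wire_id *w)
{
    uint64_t id = 0;
    int i;

    for (i = 0; i < 8; i++)
        id |= (uint64_t)w->bytes[i] << (8 * i);
    return id;
}

int main(void)
{
    struct wire_id w;
    uint64_t id = 0x0123456789abcdefULL;

    id_cpu_to_le(id, &w);
    printf("round trip ok: %d\n", id_le_to_cpu(&w) == id);
    return 0;
}
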
*/ + + if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) { + CERROR("%s: zero lmm_object_id: rc = %d\n", + exp->exp_obd->obd_name, -EINVAL); + return -EINVAL; + } + } + + lsm_size = lov_stripe_md_size(1); + if (lsmp == NULL) + return lsm_size; + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + return 0; + } + + if (*lsmp == NULL) { + OBD_ALLOC(*lsmp, lsm_size); + if (unlikely(*lsmp == NULL)) + return -ENOMEM; + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) { + OBD_FREE(*lsmp, lsm_size); + return -ENOMEM; + } + loi_init((*lsmp)->lsm_oinfo[0]); + } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) { + return -EBADF; + } + + if (lmm != NULL) + /* XXX zero *lsmp? */ + ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi); + + if (imp != NULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) + (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; + else + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + + return lsm_size; +} + +static inline void osc_pack_capa(struct ptlrpc_request *req, + struct ost_body *body, void *capa) +{ + struct obd_capa *oc = (struct obd_capa *)capa; + struct lustre_capa *c; + + if (!capa) + return; + + c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1); + LASSERT(c); + capa_cpy(c, oc); + body->oa.o_valid |= OBD_MD_FLOSSCAPA; + DEBUG_CAPA(D_SEC, c, "pack"); +} + +static inline void osc_pack_req_body(struct ptlrpc_request *req, + struct obd_info *oinfo) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); +} + +static inline void osc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +static int osc_getattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct ost_body *body; + + if (rc != 0) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body) { + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oi->oi_oa, &body->oa); + + /* This should really be sent by the OST */ + aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + } else { + CDEBUG(D_INFO, "can't unpack ost_body\n"); + rc = -EPROTO; + aa->aa_oi->oi_oa->o_valid = 0; + } +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + return rc; +} + +static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = 
oinfo; + + ptlrpc_set_add_req(set, req); + return 0; +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); + + oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd); + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + + out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_setattr_args *sa, int rc) +{ + struct ost_body *body; + + if (rc != 0) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + return rc; +} + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oinfo->oi_oa->o_lcookie = *oti->oti_logcookies; + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. 
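+ * Editorial note (not part of the upstream code): with rqset == NULL the
+ * request is queued straight to a ptlrpcd thread with no reply interpreter,
+ * so the reply is ignored and the caller's upcall never runs; passing
+ * PTLRPCD_SET also routes the request through ptlrpcd, but keeps the
+ * osc_setattr_interpret() callback and the upcall installed below.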
*/ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + } else { + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_setattr_interpret; + + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } + + return 0; +} + +static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + return osc_setattr_async_base(exp, oinfo, oti, + oinfo->oi_cb_up, oinfo, rqset); +} + +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct lov_stripe_md *lsm; + int rc; + + LASSERT(oa); + LASSERT(ea); + + lsm = *ea; + if (!lsm) { + rc = obd_alloc_memmd(exp, &lsm); + if (rc < 0) + return rc; + } + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + goto out; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_DELORPHAN) { + DEBUG_REQ(D_HA, req, + "delorphan from OST integration"); + /* Don't resend the delorphan req */ + req->rq_no_resend = req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out_req; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out_req; + } + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + /* XXX LOV STACKING: the lsm that is passed to us from LOV does not + * have valid lsm_oinfo data structs, so don't go touching that. + * This needs to be fixed in a big way. 
+ */ + lsm->lsm_oi = oa->o_oi; + *ea = lsm; + + if (oti != NULL) { + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + if (!oti->oti_logcookies) + oti_alloc_cookies(oti, 1); + *oti->oti_logcookies = oa->o_lcookie; + } + } + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + if (rc && !*ea) + obd_free_memmd(exp, &lsm); + return rc; +} + +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + return 0; +} + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_fsync_args *fa = arg; + struct ost_body *body; + + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + rc = -EPROTO; + goto out; + } + + *fa->fa_oi->oi_oa = body->oa; +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + return rc; +} + +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_oi = oinfo; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + return 0; +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. 
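+ * Editorial note (not part of the upstream code): the @cancels list built
+ * here is handed by the caller to ldlm_prep_elc_req() (see osc_destroy()
+ * below), so the lock cancellations piggyback on the destroy RPC instead of
+ * requiring separate LDLM cancel RPCs.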
*/ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + ldlm_mode_t mode, __u64 lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + return 0; + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return count; +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, + int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc = 0; + + LASSERT(oa); + LASSERT(ea); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_RECREATE_OBJS) { + return osc_real_create(exp, oa, ea, oti); + } + + if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) + return osc_real_create(exp, oa, ea, oti); + + /* we should not get here anymore */ + LBUG(); + + return rc; +} + +/* Destroy requests can be async always on the client, and we don't even really + * care about the return code since the client cannot do anything at all about + * a destroy failure. + * When the MDS is unlinking a filename, it saves the file objects into a + * recovery llog, and these object records are cancelled when the OST reports + * they were destroyed and sync'd to disk (i.e. transaction committed). + * If the client dies, or the OST is down when the object should be destroyed, + * the records are not cancelled, and when the OST reconnects to the MDS next, + * it will retrieve the llog unlink logs and then sends the log cancellation + * cookies to the MDS after committing destroy transactions. 
*/ +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_export, + void *capa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + LIST_HEAD(cancels); + int rc, count; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + return -EINVAL; + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa); + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) + oa->o_lcookie = *oti->oti_logcookies; + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + osc_pack_capa(req, body, (struct obd_capa *)capa); + ptlrpc_request_set_replen(req); + + /* If osc_destroy is for destroying the unlink orphan, + * sent from MDT to OST, which should not be blocked here, + * because the process might be triggered by ptlrpcd, and + * it is not good to block ptlrpcd thread (b=16006)*/ + if (!(oa->o_flags & OBD_FL_DELORPHAN)) { + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, + NULL); + + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + l_wait_event_exclusive(cli->cl_destroy_waitq, + osc_can_send_destroy(cli), &lwi); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + return 0; +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u32 bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_dirty = cli->cl_dirty; + if (unlikely(cli->cl_dirty - cli->cl_dirty_transit > + cli->cl_dirty_max)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); + oa->o_undirty = 0; + } else if (unlikely(atomic_read(&obd_dirty_pages) - + atomic_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). 
*/ + CERROR("dirty %d - %d > system dirty_max %d\n", + atomic_read(&obd_dirty_pages), + atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty, cli->cl_dirty_max); + oa->o_undirty = 0; + } else { + long max_in_flight = (cli->cl_max_pages_per_rpc << + PAGE_CACHE_SHIFT)* + (cli->cl_max_rpcs_in_flight + 1); + oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + oa->o_dropped = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", + oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); + +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = + cfs_time_shift(cli->cl_grant_shrink_interval); + CDEBUG(D_CACHE, "next time %ld to shrink grant \n", + cli->cl_next_shrink_grant); +} + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set); + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *aa, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBDO_FREE(oa); + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. 
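+ * Editorial note, illustrative arithmetic only (hypothetical but typical
+ * values, not part of the upstream code): with 4 KiB pages
+ * (PAGE_CACHE_SHIFT == 12), cl_max_pages_per_rpc == 256 and
+ * cl_max_rpcs_in_flight == 8, osc_shrink_grant() below first aims for
+ *   (8 + 1) * (256 << 12) = 9 MiB
+ * of grant; once cl_avail_grant is already at or below that, the target
+ * collapses to a single RPC's worth, 256 << 12 = 1 MiB.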
*/ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + return 0; + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + return -ENOMEM; + + osc_announce_cached(cli, &body->oa, 0); + + client_obd_list_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); + OBD_FREE_PTR(body); + return rc; +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + unsigned long time = cfs_time_current(); + unsigned long next_shrink = client->cl_next_shrink_grant; + + if ((client->cl_import->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_GRANT_SHRINK) == 0) + return 0; + + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
*/ + int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } + return 0; +} + +static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +{ + struct client_obd *client; + + list_for_each_entry(client, &item->ti_obd_list, + cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); + } + return 0; +} + +static int osc_add_shrink_grant(struct client_obd *client) +{ + int rc; + + rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, + TIMEOUT_GRANT, + osc_grant_shrink_grant_cb, NULL, + &client->cl_grant_shrink_list); + if (rc) { + CERROR("add grant client %s error %d\n", + client->cl_import->imp_obd->obd_name, rc); + return rc; + } + CDEBUG(D_CACHE, "add grant client %s \n", + client->cl_import->imp_obd->obd_name); + osc_update_next_shrink(client); + return 0; +} + +static int osc_del_shrink_grant(struct client_obd *client) +{ + return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, + TIMEOUT_GRANT); +} + +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty will drop + * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty must be 0 already. + */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) + cli->cl_avail_grant = ocd->ocd_grant; + else + cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; + + if (cli->cl_avail_grant < 0) { + CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", + cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, + ocd->ocd_grant, cli->cl_dirty); + /* workaround for servers which do not have the patch from + * LU-2679 */ + cli->cl_avail_grant = ocd->ocd_grant; + } + + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n", + cli->cl_import->imp_obd->obd_name, + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits); + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); +} + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
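+ * Editorial note, worked example only (not part of the upstream code): for a
+ * three-page read of 4 KiB pages where the OST returns just 5000 bytes,
+ * handle_short_read() below leaves page 0 untouched, zeroes bytes 904..4095
+ * of page 1 (only 5000 - 4096 = 904 bytes of it were read), and zeroes
+ * page 2 completely.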
*/ +static void handle_short_read(int nob_read, u32 page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + u32 page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return -EPROTO; + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) + return remote_rcs[i]; + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return -EPROTO; + } + } + + if (req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return -EPROTO; + } + + return 0; +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC|OBD_BRW_NOQUOTA); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +static u32 osc_checksum_bulk(int nob, u32 pg_count, + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + __u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~CFS_PAGE_MASK; + memcpy(ptr + off, "bad1", min(4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~CFS_PAGE_MASK, + count); + CDEBUG(D_PAGE, + "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n", + pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index, + (long)pga[i]->pg->flags, page_count(pga[i]->pg), + page_private(pga[i]->pg), + (int)(pga[i]->off & ~CFS_PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = 4; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; +} + +static int osc_brw_prep_request(int cmd, struct client_obd *cli, + struct obdo *oa, + struct lov_stripe_md *lsm, u32 page_count, + struct brw_page **pga, + struct ptlrpc_request **reqp, + struct obd_capa *ocapa, int reserve, + int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + return -ENOMEM; /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + return -EINVAL; /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + cli->cl_import->imp_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + return -ENOMEM; + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + osc_set_capa_size(req, &RMF_CAPA1, ocapa); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + opc == OST_WRITE ? 
BULK_GET_SOURCE : BULK_PUT_SINK, + OST_BULK_PORTAL); + + if (desc == NULL) { + rc = -ENOMEM; + goto out; + } + /* NB request now owns desc and will free it when it gets freed */ + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + osc_pack_capa(req, body, ocapa); + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~CFS_PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_CACHE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + + ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count); + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->len += pg->count; + } else { + niobuf->offset = pg->off; + niobuf->len = pg->count; + niobuf->flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + cksum_type_t cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + oa->o_flags &= OBD_FL_LOCAL_MASK; + body->oa.o_flags = 0; + } + body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + body->oa.o_cksum = osc_checksum_bulk(requested_nob, + page_count, pga, + OST_WRITE, + cksum_type); + CDEBUG(D_PAGE, "checksum at write origin: %x\n", + body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= cksum_type_pack(cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + } + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + if (ocapa && reserve) + aa->aa_ocapa = capa_get(ocapa); + + *reqp = req; + return 0; + + out: + ptlrpc_req_finished(req); + return rc; +} + +static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, int nob, + u32 page_count, struct brw_page **pga, + cksum_type_t client_cksum_type) +{ + __u32 new_cksum; + char *msg; + cksum_type_t cksum_type; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, + cksum_type); + + if (cksum_type != client_cksum_type) + msg = "the server did not use the checksum type specified in the original request - likely a protocol problem" + ; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)" + ; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)" + ; + + LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID + " object "DOSTID" extent [%llu-%llu]\n", + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1); + CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n", + client_cksum, client_cksum_type, + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + const lnet_process_id_t *peer = + &req->rq_import->imp_connection->c_peer; + struct client_obd *cli = aa->aa_cli; + struct ost_body *body; + __u32 client_cksum = 0; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); + return rc; + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); + return -EPROTO; + } + + /* set/clear over quota flag for a uid/gid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) { + unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid }; + + CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_valid, + body->oa.o_flags); + osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + return rc; + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("Unexpected +ve rc %d\n", rc); + return -EPROTO; + } + LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); + + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + return -EAGAIN; + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + cksum_type_unpack(aa->aa_oa->o_flags))) + return -EAGAIN; + + rc = check_write_rcs(req, aa->aa_requested_nob, + aa->aa_nio_count, + aa->aa_page_count, aa->aa_ppga); + goto out; + } + + /* The rest of this function executes only for OST_READs */ + + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + if (rc < 0) { + rc = -EAGAIN; + goto out; + } + + if (rc > aa->aa_requested_nob) { + CERROR("Unexpected rc %d (%d requested)\n", rc, + aa->aa_requested_nob); + return -EPROTO; + } + + if (rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return -EPROTO; + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + __u32 server_cksum = body->oa.o_cksum; + char *via; + char *router; + cksum_type_t cksum_type; + + cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? 
+ body->oa.o_flags : 0); + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, + aa->aa_ppga, OST_READ, + cksum_type); + + if (peer->nid == req->rq_bulk->bd_sender) { + via = router = ""; + } else { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count-1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - + 1); + CERROR("client %x, server %x, cksum_type %x\n", + client_cksum, server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("Checksum %u requested from %s but not sent\n", + cksum_missed, libcfs_nid2str(peer->nid)); + } else { + rc = 0; + } +out: + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); + + return rc; +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, + NULL /* lsm unused by osc currently */, + aa->aa_page_count, aa->aa_ppga, + &new_req, aa->aa_ocapa, 0, 1); + if (rc) + return rc; + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + if (oap->oap_interrupted) { + ptlrpc_req_finished(new_req); + return -EINTR; + } + } + } + /* New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... 
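+ * (Editorial note: a plain struct assignment of a non-empty list_head would
+ * leave the first and last entries' prev/next pointers aimed at the old
+ * head, so the copy would not actually be linked into the list; hence the
+ * INIT_LIST_HEAD() + list_splice_init() pairs below.)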
*/ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = get_seconds() + new_req->rq_timeout; + else + new_req->rq_sent = get_seconds() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + new_aa->aa_ocapa = aa->aa_ocapa; + aa->aa_ocapa = NULL; + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1); + + DEBUG_REQ(D_INFO, new_req, "new request"); + return 0; +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, u32 count) +{ + LASSERT(ppga != NULL); + OBD_FREE(ppga, sizeof(*ppga) * count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct cl_object *obj = NULL; + struct client_obd *cli = aa->aa_cli; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. 
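+ * Editorial note (not part of the upstream code): an -EAGAIN, e.g. from a
+ * checksum mismatch, is by contrast only retried while
+ * client_should_resend() allows it, and a request that crossed an eviction
+ * (import generation change) is not redone at all; any remaining -EAGAIN or
+ * -EINPROGRESS is then converted to -EIO before the extents are finished.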
*/ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + return 0; + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; + } + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + if (obj == NULL && rc == 0) { + obj = osc2cl(ext->oe_obj); + cl_object_get(obj); + } + + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + if (obj != NULL) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + + LASSERT(rc == 0); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + if (valid != 0) { + cl_object_attr_lock(obj); + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + cl_object_put(env, obj); + } + OBDO_FREE(aa->aa_oa); + + cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + return rc; +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t pol) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct cl_req *clerq = NULL; + enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : + CRT_READ; + struct ldlm_lock *lock = NULL; + struct cl_req_attr *crattr = NULL; + u64 starting_offset = OBD_OBJECT_EOF; + u64 ending_offset = 0; + int mpflag = 0; + int mem_tight = 0; + int page_count = 0; + int i; + int rc; + struct ost_body *body; + LIST_HEAD(rpc_list); + + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_CACHE_SIZE); + } + } + + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + OBD_ALLOC(crattr, sizeof(*crattr)); + if (crattr == NULL) { + rc = -ENOMEM; + goto out; + } + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) { + rc = -ENOMEM; + goto out; + } + + OBDO_ALLOC(oa); + if (oa == NULL) { + rc = -ENOMEM; + goto out; + } + + i = 0; + list_for_each_entry(oap, &rpc_list, oap_rpc_item) { + struct cl_page *page = oap2cl_page(oap); + if (clerq == NULL) { + clerq = cl_req_alloc(env, page, crt, + 1 /* only 1-object rpcs for now */); + if (IS_ERR(clerq)) { + rc = PTR_ERR(clerq); + goto out; + } + lock = oap->oap_ldlm_lock; + } + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", + pga[i]->pg, page_index(oap->oap_page), oap, + pga[i]->flag); + i++; + cl_req_page_add(env, clerq, page); + } + + /* always get the data for the obdo for the rpc */ + LASSERT(clerq != NULL); + crattr->cra_oa = oa; + cl_req_attr_set(env, clerq, crattr, ~0ULL); + if (lock) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + + rc = cl_req_prep(env, clerq); + if (rc != 0) { + CERROR("cl_req_prep failed: %d\n", rc); + goto out; + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, + pga, &req, crattr->cra_capa, 1, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + goto out; + } + + req->rq_interpret_reply = brw_interpret; + + if (mem_tight != 0) + req->rq_memalloc = 1; + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. 
bug 10150 */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + crattr->cra_oa = &body->oa; + cl_req_attr_set(env, clerq, crattr, + OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); + + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + aa->aa_clerq = clerq; + + /* queued sync pages can be torn down while the pages + * were between the pending list and the rpc */ + tmp = NULL; + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + /* only one oap gets a request reference */ + if (tmp == NULL) + tmp = oap; + if (oap->oap_interrupted && !req->rq_intr) { + CDEBUG(D_INODE, "oap %p in req %p interrupted\n", + oap, req); + ptlrpc_mark_interrupted(req); + } + } + if (tmp != NULL) + tmp->oap_request = ptlrpc_request_addref(req); + + client_obd_list_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_CACHE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + + /* XXX: Maybe the caller can check the RPC bulk descriptor to + * see which CPU/NUMA node the majority of pages were allocated + * on, and try to assign the async RPC to the CPU core + * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic. + * + * But on the other hand, we expect that multiple ptlrpcd + * threads and the initial write sponsor can run in parallel, + * especially when data checksum is enabled, which is CPU-bound + * operation and single ptlrpcd thread cannot process in time. + * So more ptlrpcd threads sharing BRW load + * (with PDL_POLICY_ROUND) seems better. 
+ */ + ptlrpcd_add_req(req, pol, -1); + rc = 0; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (crattr != NULL) { + capa_put(crattr->cra_capa); + OBD_FREE(crattr, sizeof(*crattr)); + } + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, rc); + } + return rc; +} + +static int osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + void *data = einfo->ei_cbdata; + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); + + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); + + return set; +} + +static int osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int set = 0; + + if (lock != NULL) { + set = osc_set_lock_data_with_check(lock, einfo); + LDLM_LOCK_PUT(lock); + } else + CERROR("lockh %p, data %p - client evicted?\n", + lockh, einfo->ei_cbdata); + return set; +} + +/* find any ldlm lock of the inode in osc + * return 0 not find + * 1 find one + * < 0 error */ +static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) +{ + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); + int rc = 0; + + ostid_build_res_name(&lsm->lsm_oi, &res_id); + rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + if (rc == LDLM_ITER_STOP) + return 1; + if (rc == LDLM_ITER_CONTINUE) + return 0; + return rc; +} + +static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, + obd_enqueue_update_f upcall, void *cookie, + __u64 *flags, int agl, int rc) +{ + int intent = *flags & LDLM_FL_HAS_INTENT; + + if (intent) { + /* The request was created before ldlm_cli_enqueue call. */ + if (rc == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + rep = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_REP); + + LASSERT(rep != NULL); + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + rc = rep->lock_policy_res1; + } + } + + if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) || + (rc == 0)) { + *flags |= LDLM_FL_LVB_READY; + CDEBUG(D_INODE, "got kms %llu blocks %llu mtime %llu\n", + lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime); + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, rc); + return rc; +} + +static int osc_enqueue_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle handle; + __u32 mode; + struct ost_lvb *lvb; + __u32 lvb_len; + __u64 *flags = aa->oa_flags; + + /* Make a local copy of a lock handle and a mode, because aa->oa_* + * might be freed anytime after lock upcall has been called. 
*/ + lustre_handle_copy(&handle, aa->oa_lockh); + mode = aa->oa_ei->ei_mode; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(&handle); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(&handle, mode); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) { + lvb = NULL; + lvb_len = 0; + } else { + lvb = aa->oa_lvb; + lvb_len = sizeof(*aa->oa_lvb); + } + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1, + mode, flags, lvb, lvb_len, &handle, rc); + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie, + flags, aa->oa_agl, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + /* Release the lock for async request. */ + if (lustre_handle_is_used(&handle) && rc == ELDLM_OK) + /* + * Releases a reference taken by ldlm_cli_enqueue(), if it is + * not already released by + * ldlm_cli_enqueue_fini()->failed_lock_cleanup() + */ + ldlm_lock_decref(&handle, mode); + + LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n", + aa->oa_lockh, req, aa); + ldlm_lock_decref(&handle, mode); + LDLM_LOCK_PUT(lock); + return rc; +} + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); + ldlm_mode_t mode; + int rc; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. + */ + if (!kms_valid) + goto no_match; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. 
+ * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id, + einfo->ei_type, policy, mode, lockh, 0); + if (mode) { + struct ldlm_lock *matched = ldlm_handle2lock(lockh); + + if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) { + /* For AGL, if enqueue RPC is sent but the lock is not + * granted, then skip to process this strpe. + * Return -ECANCELED to tell the caller. */ + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + return -ECANCELED; + } else if (osc_set_lock_data_with_check(matched, einfo)) { + *flags |= LDLM_FL_LVB_READY; + /* addref the lock only if not async requests and PW + * lock is matched whereas we asked for PR. */ + if (!rqset && einfo->ei_mode != mode) + ldlm_lock_addref(lockh, LCK_PR); + if (intent) { + /* I would like to be able to ASSERT here that + * rss <= kms, but I can't, for reasons which + * are explained in lov_enqueue() */ + } + + /* We already have a lock, and it's referenced. + * + * At this point, the cl_lock::cll_state is CLS_QUEUING, + * AGL upcall may change it to CLS_HELD directly. */ + (*upcall)(cookie, ELDLM_OK); + + if (einfo->ei_mode != mode) + ldlm_lock_decref(lockh, LCK_PW); + else if (rqset) + /* For async requests, decref the lock. */ + ldlm_lock_decref(lockh, einfo->ei_mode); + LDLM_LOCK_PUT(matched); + return ELDLM_OK; + } else { + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + no_match: + if (intent) { + LIST_HEAD(cancels); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + return -ENOMEM; + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(*lvb)); + ptlrpc_request_set_replen(req); + } + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, lockh, async); + if (rqset) { + if (!rc) { + struct osc_enqueue_args *aa; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_ei = einfo; + aa->oa_exp = exp; + aa->oa_flags = flags; + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_lvb = lvb; + aa->oa_lockh = lockh; + aa->oa_agl = !!agl; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_enqueue_interpret; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); + } + return rc; + } + + rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc); + if (intent) + ptlrpc_req_finished(req); + + return rc; +} + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + ldlm_mode_t rc; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + return -EIO; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & 
~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + rc = mode; + if (mode == LCK_PR) + rc |= LCK_PW; + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, rc, lockh, unref); + if (rc) { + if (data != NULL) { + if (!osc_set_data_with_check(lockh, data)) { + if (!(lflags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref(lockh, rc); + return 0; + } + } + if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) { + ldlm_lock_addref(lockh, LCK_PR); + ldlm_lock_decref(lockh, LCK_PW); + } + return rc; + } + return rc; +} + +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode) +{ + if (unlikely(mode == LCK_GROUP)) + ldlm_lock_decref_and_cancel(lockh, mode); + else + ldlm_lock_decref(lockh, mode); + + return 0; +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + + if (rc == -EBADR) + /* The request has in fact never been sent + * due to issues at a higher level (LOV). + * Exit immediately since the caller is + * aware of the problem and takes care + * of the clean up */ + return rc; + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) { + rc = 0; + goto out; + } + + if (rc != 0) + goto out; + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + return rc; +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + return -ENOMEM; + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + return 0; +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + return -ENODEV; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + return -ENOMEM; + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *osfs = *msfs; + + out: + ptlrpc_req_finished(req); + return rc; +} + +/* Retrieve object striping information. + * + * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_MAGIC (we only use 1 slot here). 
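+ * The OSC layer always describes a single object, so the copied-out
+ * lmm reports exactly one stripe.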
+ */ +static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum, *lumk; + struct lov_user_ost_data_v1 *lmm_objects; + int rc = 0, lum_size; + + if (!lsm) + return -ENODATA; + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) + return -EFAULT; + + if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) + return -EINVAL; + + /* lov_user_md_vX and lov_mds_md_vX must have the same size */ + LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1)); + LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3)); + LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0])); + + /* we can use lov_mds_md_size() to compute lum_size + * because lov_user_md_vX and lov_mds_md_vX have the same size */ + if (lum.lmm_stripe_count > 0) { + lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic); + OBD_ALLOC(lumk, lum_size); + if (!lumk) + return -ENOMEM; + + if (lum.lmm_magic == LOV_USER_MAGIC_V1) + lmm_objects = + &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); + else + lmm_objects = &(lumk->lmm_objects[0]); + lmm_objects->l_ost_oi = lsm->lsm_oi; + } else { + lum_size = lov_mds_md_size(0, lum.lmm_magic); + lumk = &lum; + } + + lumk->lmm_oi = lsm->lsm_oi; + lumk->lmm_stripe_count = 1; + + if (copy_to_user(lump, lumk, lum_size)) + rc = -EFAULT; + + if (lumk != &lum) + OBD_FREE(lumk, lum_size); + + return rc; +} + + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int err = 0; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. 
Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_LOV_GET_CONFIG: { + char *buf; + struct lov_desc *desc; + struct obd_uuid uuid; + + buf = NULL; + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) { + err = -EINVAL; + goto out; + } + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + err = -EINVAL; + goto out; + } + + if (data->ioc_inllen2 < sizeof(uuid)) { + obd_ioctl_freedata(buf, len); + err = -EINVAL; + goto out; + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + desc->ld_tgt_count = 1; + desc->ld_active_tgt_count = 1; + desc->ld_default_stripe_count = 1; + desc->ld_default_stripe_size = 0; + desc->ld_default_stripe_offset = 0; + desc->ld_pattern = 0; + memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid)); + + memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid)); + + err = copy_to_user((void *)uarg, buf, len); + if (err) + err = -EFAULT; + obd_ioctl_freedata(buf, len); + goto out; + } + case LL_IOC_LOV_SETSTRIPE: + err = obd_alloc_memmd(exp, karg); + if (err > 0) + err = 0; + goto out; + case LL_IOC_LOV_GETSTRIPE: + err = osc_getstripe(karg, uarg); + goto out; + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (err > 0) + err = 0; + goto out; + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + goto out; + case OBD_IOC_POLL_QUOTACHECK: + err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg); + goto out; + case OBD_IOC_PING_TARGET: + err = ptlrpc_obd_ping(obd); + goto out; + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + err = -ENOTTY; + goto out; + } +out: + module_put(THIS_MODULE); + return err; +} + +static int osc_get_info(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + if (!vallen || !val) + return -EFAULT; + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + __u32 *stripe = val; + *vallen = sizeof(*stripe); + *stripe = 0; + return 0; + } else if (KEY_IS(KEY_LAST_ID)) { + struct ptlrpc_request *req; + u64 *reply; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_LAST_ID); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + + req->rq_no_delay = req->rq_no_resend = 1; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + *((u64 *)val) = *reply; + out: + ptlrpc_req_finished(req); + return rc; + } else if (KEY_IS(KEY_FIEMAP)) { + struct ll_fiemap_info_key *fm_key = + (struct ll_fiemap_info_key *)key; + struct ldlm_res_id res_id; + ldlm_policy_data_t policy; + struct lustre_handle lockh; + ldlm_mode_t mode = 0; + struct ptlrpc_request *req; + struct ll_user_fiemap *reply; + char *tmp; + int rc; + + if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fm_key->fiemap.fm_start & + CFS_PAGE_MASK; + + if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <= + fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1) + policy.l_extent.end = 
OBD_OBJECT_EOF; + else + policy.l_extent.end = (fm_key->fiemap.fm_start + + fm_key->fiemap.fm_length + + PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK; + + ostid_build_res_name(&fm_key->oa.o_oi, &res_id); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | + LDLM_FL_LVB_READY, + &res_id, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh, 0); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fm_key->oa.o_valid |= OBD_MD_FLFLAGS; + fm_key->oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) { + rc = -ENOMEM; + goto drop_lock; + } + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_CLIENT, *vallen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_SERVER, *vallen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + goto drop_lock; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, val, *vallen); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto fini_req; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) { + rc = -EPROTO; + goto fini_req; + } + + memcpy(val, reply, *vallen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + return rc; + } + + return -EINVAL; +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + return -EINVAL; + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + return 0; + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + return 0; + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + return 0; + } + + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + atomic_inc(&cli->cl_cache->ccc_users); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + return 0; + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + int nr = atomic_read(&cli->cl_lru_in_list) >> 1; + int target = *(int *)val; + + nr = osc_lru_shrink(cli, min(nr, target)); + *(int *)val -= nr; + return 0; + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + return -EINVAL; + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. 
+ Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? + &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_brw_async_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBDO_ALLOC(oa); + if (!oa) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + return 0; +} + +static int osc_reconnect(const struct lu_env *env, + struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + + client_obd_list_lock(&cli->cl_loi_list_lock); + data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?: + 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n", + data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + return 0; +} + +static int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + int rc; + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * init_grant_shrink + * add this client to shrink list + * cleanup_osc + * Bang! pinger trigger the shrink. + * So the osc should be disconnected from the shrink list, after we + * are sure the import has been destroyed. 
BUG18662 + */ + if (obd->u.cli.cl_import == NULL) + osc_del_shrink_grant(&obd->u.cli); + return rc; +} + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants */ + cli = &obd->u.cli; + /* all pages go to failing rpcs due to the invalid + * import */ + osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + return rc; +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_for_recovery(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + + /* + * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR. + * + * XXX as a future improvement, we can also cancel unused write lock + * if it doesn't have dirty data and active mmaps. 
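+ * (A lock counts as "unused" here when osc_dlm_lock_pageref() reports
+ * no pages still referencing it.)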
+ */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_granted_mode == LCK_PR || + lock->l_granted_mode == LCK_CR) && + (osc_dlm_lock_pageref(lock) == 0)) + return 1; + + return 0; +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + return 0; +} + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + + rc = ptlrpcd_addref(); + if (rc) + return rc; + + rc = client_obd_setup(obd, lcfg); + if (rc) + goto out_ptlrpcd; + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) { + rc = PTR_ERR(handler); + goto out_client_setup; + } + cli->cl_writeback_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + goto out_ptlrpcd_work; + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + lprocfs_osc_init_vars(&lvars); + if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } + + /* We need to allocate a few requests more, because + * brw_interpret tries to create new requests before freeing + * previous ones, Ideally we want to have 2x max_rpcs_in_flight + * reserved, but I'm afraid that might be too much wasted RAM + * in fact, so 2 is just my guess and still should work. */ + cli->cl_import->imp_rq_pool = + ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, + OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + INIT_LIST_HEAD(&cli->cl_grant_shrink_list); + ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); + return rc; + +out_ptlrpcd_work: + ptlrpcd_destroy_work(handler); +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + return rc; +} + +static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + switch (stage) { + case OBD_CLEANUP_EARLY: { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name); + /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */ + ptlrpc_deactivate_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_pingable = 0; + spin_unlock(&imp->imp_lock); + break; + } + case OBD_CLEANUP_EXPORTS: { + struct client_obd *cli = &obd->u.cli; + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + break; + } + } + return 0; +} + +int osc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + atomic_dec(&cli->cl_cache->ccc_users); + cli->cl_cache = NULL; + } + + /* free memory of osc quota 
cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + return rc; +} + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc = 0; + + lprocfs_osc_init_vars(&lvars); + + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + + return rc; +} + +static int osc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + +struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_packmd = osc_packmd, + .o_unpackmd = osc_unpackmd, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_getattr_async = osc_getattr_async, + .o_setattr = osc_setattr, + .o_setattr_async = osc_setattr_async, + .o_find_cbdata = osc_find_cbdata, + .o_iocontrol = osc_iocontrol, + .o_get_info = osc_get_info, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_process_config = osc_process_config, + .o_quotactl = osc_quotactl, + .o_quotacheck = osc_quotacheck, +}; + +extern struct lu_kmem_descr osc_caches[]; +extern spinlock_t osc_ast_guard; +extern struct lock_class_key osc_ast_guard_class; + +static int __init osc_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + return rc; + + lprocfs_osc_init_vars(&lvars); + + rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) { + lu_kmem_fini(osc_caches); + return rc; + } + + spin_lock_init(&osc_ast_guard); + lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); + + return rc; +} + +static void /*__exit*/ osc_exit(void) +{ + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile b/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile new file mode 100644 index 000000000..fb50cd4c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile @@ -0,0 +1,20 @@ +obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o +LDLM := ../../lustre/ldlm/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o +ldlm_objs += $(LDLM)interval_tree.o +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) +ptlrpc-$(CONFIG_PROC_FS) += sec_lproc.o +ptlrpc-$(CONFIG_LUSTRE_TRANSLATE_ERRNOS) += errno.o diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c new file mode 100644 index 000000000..0357f1d45 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c @@ -0,0 +1,3149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_import.h" +#include "../include/lustre_req_layout.h" + +#include "ptlrpc_internal.h" + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); +static int ptlrpcd_check_work(struct ptlrpc_request *req); + +/** + * Initialize passed in client structure \a cl. 
+ */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remote uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + lnet_process_id_t peer; + int err; + + /* ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. */ + /* coverity[uninit_use_in_call] */ + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} +EXPORT_SYMBOL(ptlrpc_uuid_to_connection); + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. + */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages])); + if (!desc) + return NULL; + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = npages; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateHandle(&desc->bd_mds[i]); + + return desc; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a npages * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocated initialized bulk descriptor or NULL on + * error. + */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); + desc = ptlrpc_new_bulk(npages, max_brw, type, portal); + if (desc == NULL) + return NULL; + + desc->bd_import_generation = req->rq_import_generation; + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +/** + * Add a page \a page to the bulk descriptor \a desc. 
+ * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int pin) +{ + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_CACHE_SIZE); + + desc->bd_nob += len; + + if (pin) + page_cache_get(page); + + ptlrpc_add_bulk_page(desc, page, pageoffset, len); +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +/** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) +{ + int i; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if (unpin) { + for (i = 0; i < desc->bd_iov_count; i++) + page_cache_release(desc->bd_iov[i].kiov_page); + } + + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + bd_iov[desc->bd_max_iov])); +} +EXPORT_SYMBOL(__ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + req->rq_timeout = at_est2timeout(serv_est); + } + /* We could get even fancier here, using history to predict increased + loading... */ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) +{ + int idx; + unsigned int oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 
0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) +{ + unsigned int nl, oldnl; + struct imp_at *at; + time_t now = get_seconds(); + + LASSERT(req->rq_import); + + if (service_time > now - req->rq_sent + 3) { + /* bz16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? + D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time " + CFS_DURATION_T"\n", service_time, + cfs_time_sub(now, req->rq_sent)); + return; + } + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - + service_time, 0) + 1; /* st rounding */ + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return -EPROTO; + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return -EPROTO; + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_request *early_req; + time_t olddl; + int rc; + + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + return rc; + } + + rc = unpack_reply(early_req); + if (rc == 0) { + /* Expecting to increase the service time estimate here */ + ptlrpc_at_adj_service(req, + lustre_msg_get_timeout(early_req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); + } + + sptlrpc_cli_finish_early_reply(early_req); + + if (rc != 0) { + spin_lock(&req->rq_lock); + return rc; + } + + /* Adjust the local timeout for this req */ + ptlrpc_at_set_req_timeout(req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when it sent the + * early reply, so client should give it at least that long. */ + req->rq_deadline = get_seconds() + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)", + req->rq_early_count, + cfs_time_sub(req->rq_deadline, get_seconds()), + cfs_time_sub(req->rq_deadline, olddl)); + + return rc; +} + +struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache == NULL ? 
-ENOMEM : 0; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct list_head *l, *tmp; + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + list_for_each_safe(l, tmp, &pool->prp_req_list) { + req = list_entry(l, struct ptlrpc_request, rq_list); + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool from %d to %d bytes\n", + pool->prp_rq_size, size); + + spin_lock(&pool->prp_lock); + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + spin_unlock(&pool->prp_lock); + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + } + spin_unlock(&pool->prp_lock); +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. 
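+ * NULL is also returned when \a populate_pool fails to add even a single
+ * request, in which case the partially initialized pool is freed.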
+ */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + void (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool)); + if (!pool) + return NULL; + + /* Request next power of two for the allocation, because internally + kernel would do exactly this */ + + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + if (list_empty(&pool->prp_req_list)) { + /* have not allocated a single request for the pool */ + OBD_FREE(pool, sizeof(struct ptlrpc_request_pool)); + pool = NULL; + } + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. 
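+ * The caller must already have removed \a request from any list, and the
+ * request must not be waiting for a reply.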
+ */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp = request->rq_import; + int rc; + + if (unlikely(ctx)) + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + goto out_free; + } + + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) { + LASSERT(!request->rq_pool); + goto out_ctx; + } + + lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + request->rq_export = NULL; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + spin_lock_init(&request->rq_lock); + INIT_LIST_HEAD(&request->rq_list); + INIT_LIST_HEAD(&request->rq_timed_list); + INIT_LIST_HEAD(&request->rq_replay_list); + INIT_LIST_HEAD(&request->rq_ctx_chain); + INIT_LIST_HEAD(&request->rq_set_chain); + INIT_LIST_HEAD(&request->rq_history_list); + INIT_LIST_HEAD(&request->rq_exp_list); + init_waitqueue_head(&request->rq_reply_waitq); + init_waitqueue_head(&request->rq_set_waitq); + request->rq_xid = ptlrpc_next_xid(); + atomic_set(&request->rq_refcount, 1); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + + return 0; +out_ctx: + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + class_import_put(imp); + return rc; +} + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + return __ptlrpc_request_bufs_pack(request, version, opcode, count, + request->rq_pill.rc_area[RCL_CLIENT], + bufs, ctx); +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + int rc; + rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); + if (rc) + return rc; + + /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of + * ptlrpc_body sent from server equal to local ptlrpc_body size, so we + * have to send old ptlrpc_body to keep interoperability with these + * clients. + * + * Only three kinds of server->client RPCs so far: + * - LDLM_BL_CALLBACK + * - LDLM_CP_CALLBACK + * - LDLM_GL_CALLBACK + * + * XXX This should be removed whenever we drop the interoperability with + * the these old clients. 
+ */ + if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || + opcode == LDLM_GL_CALLBACK) + req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, + sizeof(struct ptlrpc_body_v2), RCL_CLIENT); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + if (pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (!request) + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (request) { + LASSERTF((unsigned long)imp > 0x1000, "%p", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request structure and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. + */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. + */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operation \a opcode and immediately pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. 
+ */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Prepare request (fetched from pool \a pool if not NULL) on import \a imp + * for operation \a opcode. Request would contain \a count buffers. + * Sizes of buffers are described in array \a lengths and buffers themselves + * are provided by a pointer \a bufs. + * Returns prepared request structure pointer or NULL on error. + */ +struct ptlrpc_request * +ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + int rc; + + request = __ptlrpc_request_alloc(imp, pool); + if (!request) + return NULL; + + rc = __ptlrpc_request_bufs_pack(request, version, opcode, count, + lengths, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + request = NULL; + } + return request; +} +EXPORT_SYMBOL(ptlrpc_prep_req_pool); + +/** + * Same as ptlrpc_prep_req_pool, but without pool + */ +struct ptlrpc_request * +ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, + __u32 *lengths, char **bufs) +{ + return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs, + NULL); +} +EXPORT_SYMBOL(ptlrpc_prep_req); + +/** + * Allocate and initialize new request set structure. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + + OBD_ALLOC(set, sizeof(*set)); + if (!set) + return NULL; + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + INIT_LIST_HEAD(&set->set_cblist); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + return set; +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + return NULL; + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + return set; +} +EXPORT_SYMBOL(ptlrpc_prep_fcset); + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. 
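/*
 * Minimal usage sketch of the common "allocate, pack, send synchronously,
 * drop the reference" sequence built from the helpers above; the format,
 * version and opcode are left as parameters rather than naming any
 * particular RPC.
 */
static int example_sync_rpc(struct obd_import *imp,
			    const struct req_format *fmt,
			    __u32 version, int opcode)
{
	struct ptlrpc_request *req;
	int rc;

	req = ptlrpc_request_alloc_pack(imp, fmt, version, opcode);
	if (req == NULL)
		return -ENOMEM;

	/* send and wait for completion (ptlrpc_queue_wait() is defined
	 * further down in this file) */
	rc = ptlrpc_queue_wait(req);

	/* drop our reference; the request is freed when it reaches zero */
	ptlrpc_req_finished(req);
	return rc;
}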
+ */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a callback function \a fn to the set. + * This function would be called when all requests on this set are completed. + * The function will be passed \a data argument. + */ +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data) +{ + struct ptlrpc_set_cbdata *cbdata; + + OBD_ALLOC_PTR(cbdata); + if (cbdata == NULL) + return -ENOMEM; + + cbdata->psc_interpret = fn; + cbdata->psc_data = data; + list_add_tail(&cbdata->psc_item, &set->set_cblist); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_set_add_cb); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. + */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_set_chain)); + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = cfs_time_current(); + + if (req->rq_reqmsg != NULL) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer != NULL) + /* If the request set has a producer callback, the RPC must be + * sent straight away */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = cfs_time_current(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. 
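/*
 * Minimal usage sketch for batching already-prepared requests in one set.
 * ptlrpc_set_add_req() takes over the caller's reference, so the requests
 * handed to the set are released by ptlrpc_set_destroy() below, not by an
 * extra ptlrpc_req_finished() in the caller.
 */
static int example_send_batch(struct ptlrpc_request **reqs, int nr)
{
	struct ptlrpc_request_set *set;
	int i, rc;

	set = ptlrpc_prep_set();
	if (set == NULL)
		return -ENOMEM;

	for (i = 0; i < nr; i++)
		ptlrpc_set_add_req(set, reqs[i]);

	/* sends everything and waits until each request has completed,
	 * timed out or failed */
	rc = ptlrpc_set_wait(set);

	ptlrpc_set_destroy(set);
	return rc;
}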
It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpc_set_add_new_req); + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + + LASSERT(status != NULL); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + /* pings may safely race with umount */ + DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + D_HA : D_ERROR, req, "IMP_CLOSED "); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR after initial testing */ + DEBUG_REQ(D_ERROR, req, "send limit expired "); + *status = -EIO; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + /* allow CONNECT even if import is invalid */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* bz 12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (imp->imp_dlm_fake || req->rq_no_delay) { + *status = -EWOULDBLOCK; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery.\n"); + } else { + delay = 1; + } + } + + return delay; +} + +/** + * Decide if the error message regarding provided request \a req + * should be printed to the console or not. + * Makes it's decision on request status and other properties. + * Returns 1 to print error on the system console or 0 if not. + */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) +{ + __u32 opc; + int err; + + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; + } + + return 1; +} + +/** + * Check request processing status. 
+ * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + + err = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", + imp->imp_obd->obd_name, + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + return err < 0 ? err : -EINVAL; + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + return err; +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", + versions[0], versions[1]); +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return value would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. + */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + int rc; + struct timeval work_start; + long timediff; + + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink); + + if (req->rq_reply_truncate) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d", + req->rq_nob_received, req->rq_repbuf_len); + return -EOVERFLOW; + } + + sptlrpc_cli_free_repbuf(req); + /* Pass the required reply buffer size (include + * space for early reply). + * NB: no need to roundup because alloc_repbuf + * will roundup it */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + return 0; + } + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); + return rc; + } + + /* + * Security layer unwrap might ask resend this request. 
+ */ + if (req->rq_resend) + return 0; + + rc = unpack_reply(req); + if (rc) + return rc; + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time_t now = get_seconds(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + req->rq_nr_resend++; + + /* allocate new xid to avoid reply reconstruction */ + if (!req->rq_bulk) { + /* new xid is already allocated for bulk in + * ptlrpc_check_set() */ + req->rq_xid = ptlrpc_next_xid(); + DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS"); + } + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + return 0; + } + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); + if (obd->obd_svc_stats != NULL) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + return -EPROTO; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + imp->imp_connect_error = rc; + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ll_rpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + return rc; + } + ptlrpc_request_handle_notconn(req); + return rc; + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + spin_lock(&imp->imp_lock); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. 
b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb != NULL && + list_empty(&req->rq_replay_list)) { + /* NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + spin_lock(&imp->imp_lock); + } + + /* + * Replay-enabled imports return commit-status information. + */ + if (lustre_msg_get_last_committed(req->rq_repmsg)) { + imp->imp_peer_committed_transno = + lustre_msg_get_last_committed(req->rq_repmsg); + } + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + return rc; +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + int rc; + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + if (req->rq_sent && (req->rq_sent > get_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + return 0; + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)", + lustre_msg_get_status(req->rq_reqmsg), + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + return 0; + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + return rc; + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + rc = sptlrpc_req_refresh_ctx(req, -1); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + return 1; + } + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + return 0; + } + + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", + current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + return rc; + } + return 0; +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, 
rc; + + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + return 0; + } + } + + return (atomic_read(&set->set_remaining) - remaining); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *next; + struct list_head comp_reqs; + int force_timer_recalc = 0; + + if (atomic_read(&set->set_remaining) == 0) + return 1; + + INIT_LIST_HEAD(&comp_reqs); + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int rc = 0; + + /* This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. + */ + cond_resched(); + + if (req->rq_phase == RQ_PHASE_NEW && + ptlrpc_send_new_req(req)) { + force_timer_recalc = 1; + } + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > get_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREGISTERING || + req->rq_phase == RQ_PHASE_COMPLETE)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + goto interpret; + + /* + * Note that this also will start async reply unlink. 
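/*
 * Minimal sketch of the producer contract consumed by ptlrpc_set_producer()
 * above: the callback registered through ptlrpc_prep_fcset() adds one new
 * request to the set per call (ptlrpc_set_add_req() sends it straight away
 * for producer sets) and returns -ENOENT once nothing is left to send.
 * example_build_next_req() is a hypothetical helper; the callback signature
 * is inferred from the call site above.
 */
static struct ptlrpc_request *example_build_next_req(void *arg);

static int example_producer(struct ptlrpc_request_set *set, void *arg)
{
	struct ptlrpc_request *req;

	req = example_build_next_req(arg);
	if (req == NULL)
		return -ENOENT;		/* no more RPCs to produce */

	ptlrpc_set_add_req(set, req);
	return 0;
}

/*
 * A caller keeps at most, say, 8 requests in flight with:
 *	set = ptlrpc_prep_fcset(8, example_producer, arg);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 */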
+ */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. */ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } else { + continue; + } + } + + if (req->rq_err) { + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. + */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) + continue; + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, + &status)) { + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp-> + imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + goto interpret; + } + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + goto interpret; + } + + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + if (req->rq_bulk) { + __u64 old_xid; + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* ensure previous bulk fails */ + old_xid = req->rq_xid; + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_xid, req->rq_xid); + } + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. 
+ */ + status = sptlrpc_req_refresh_ctx(req, -1); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* unlink from net because we are going to + * swab in-place of reply buffer */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) + continue; + + /* If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (req->rq_bulk == NULL || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + +interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* This moves to "unregistering" phase we need to wait for + * reply unlink. */ + if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* When calling interpret receiving already should be + * finished. */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, + "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", + current_comm(), imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + spin_lock(&imp->imp_lock); + /* Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. 
*/ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up_all(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* free the request that has just been completed + * in order not to pollute set->set_requests */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* move completed request at the head of list so it's easier for + * caller to find them */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. */ + return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + int rc = 0; + + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T + "/real "CFS_DURATION_T"]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) || + cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? + "timed out for sent delay" : "timed out for slow reply"), + req->rq_sent, req->rq_real_sent); + + if (imp != NULL && obd_debug_peer_on_timeout) + LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + return 1; + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + return 1; + + /* If this request is for recovery or other primordial tasks, + * then error it out here. */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return 1; + } + + /* if a request can't be resent we can't wait for an answer after + the timeout */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + return rc; +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * Callback used when waiting on sets with l_wait_event. + * Always returns 1. 
+ */ +int ptlrpc_expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time_t now = get_seconds(); + + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); + } + + /* + * When waiting for a whole set, we always break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * if everyone's timed out. + */ + return 1; +} +EXPORT_SYMBOL(ptlrpc_expired_set); + +/** + * Sets rq_intr flag in \a req under spinlock. + */ +void ptlrpc_mark_interrupted(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_mark_interrupted); + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. Callback for l_wait_event for interruptible waits. + */ +void ptlrpc_interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREGISTERING) + continue; + + ptlrpc_mark_interrupted(req); + } +} +EXPORT_SYMBOL(ptlrpc_interrupted_set); + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + time_t now = get_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + int deadline; + + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* + * Already timed out. + */ + if (req->rq_timedout) + continue; + + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + return timeout; +} +EXPORT_SYMBOL(ptlrpc_set_next_timeout); + +/** + * Send all unset request from the set and then wait until all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. 
+ */ +int ptlrpc_set_wait(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + int rc, timeout; + + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + return 0; + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", + set, timeout); + + if (timeout == 0 && !cfs_signal_pending()) + /* + * No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. + */ + lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), + ptlrpc_expired_set, + ptlrpc_interrupted_set, set); + else + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, set); + + rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + + /* LU-769 - if we ignored the signal because it was already + * pending when we started, we need to handle it now or we risk + * it being ignored forever */ + if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && + cfs_signal_pending()) { + sigset_t blocked_sigs = + cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + /* In fact we only interrupt for the "fatal" signals + * like SIGINT or SIGKILL. We still ignore less + * important signals since ptlrpc set is not easily + * reentrant from userspace again */ + if (cfs_signal_pending()) + ptlrpc_interrupted_set(set); + cfs_restore_sigs(blocked_sigs); + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = + set->set_interpret; + rc = interpreter(set, set->set_arg, rc); + } else { + struct ptlrpc_set_cbdata *cbdata, *n; + int err; + + list_for_each_entry_safe(cbdata, n, + &set->set_cblist, psc_item) { + list_del_init(&cbdata->psc_item); + err = cbdata->psc_interpret(set, cbdata->psc_data, rc); + if (err && !rc) + rc = err; + OBD_FREE_PTR(cbdata); + } + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper function for request freeing. 
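/*
 * Minimal sketch of a per-set completion callback registered with
 * ptlrpc_set_add_cb(); ptlrpc_set_wait() above invokes it after all
 * requests in the set have completed, passing the accumulated status.
 */
static int example_set_done(struct ptlrpc_request_set *set, void *data,
			    int rc)
{
	CDEBUG(D_RPCTRACE, "set %p finished with rc = %d\n", set, rc);
	return rc;
}

/*
 * Registered before waiting on the set, e.g.
 *	rc = ptlrpc_set_add_cb(set, example_set_done, my_data);
 */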
+ * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + if (request == NULL) + return; + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */ + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ + if (request->rq_import != NULL) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf != NULL) + sptlrpc_cli_free_repbuf(request); + if (request->rq_export != NULL) { + class_export_put(request->rq_export); + request->rq_export = NULL; + } + if (request->rq_import != NULL) { + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + ptlrpc_free_bulk_pin(request->rq_bulk); + + if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} +EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request when reference count reaches zero. + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + if (request == NULL) + return 1; + + if (request == LP_POISON || + request->rq_reqmsg == LP_POISON) { + CERROR("dereferencing freed request (bug 575)\n"); + LBUG(); + return 1; + } + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + if (atomic_dec_and_test(&request->rq_refcount)) { + __ptlrpc_free_req(request, locked); + return 1; + } + + return 0; +} + +/** + * Drops one reference count for a request. 
+ */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + int rc; + wait_queue_head_t *wq; + struct l_wait_info lwi; + + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* + * Let's setup deadline for reply unlink. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0) + request->rq_reply_deadline = get_seconds()+LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + return 1; + + LNetMDUnlink(request->rq_reply_md_h); + + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + return 1; + + /* + * Move to "Unregistering" phase as reply was not unlinked yet. + */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); + + /* + * Do not wait for unlink to finish. + */ + if (async) + return 0; + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + if (request->rq_set != NULL) + wq = &request->rq_set->set_waitq; + else + wq = &request->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + return 1; + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, + "Unexpectedly long timeout rvcng=%d unlnk=%d/%d", + request->rq_receiving_reply, + request->rq_req_unlink, request->rq_reply_unlink); + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_reply); + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb != NULL) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) + ptlrpc_free_request(req); + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meeting first + * transno bigger than last_committed. 
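/*
 * Minimal sketch of a commit callback as invoked by ptlrpc_free_request()
 * above: it runs once the request is dropped from the import replay list,
 * i.e. when its transno is known to be committed on the server or when the
 * caller forces it.  Assigning rq_commit_cb before the request is sent is
 * an assumption made for this example.
 */
static void example_commit_cb(struct ptlrpc_request *req)
{
	CDEBUG(D_RPCTRACE, "request x%llu committed, transno %llu\n",
	       req->rq_xid, req->rq_transno);
}

/*
 * Typical wiring:
 *	req->rq_commit_cb = example_commit_cb;
 * before sending, optionally followed later by
 *	ptlrpc_request_committed(req, 1);
 * to run the callback without waiting for last_committed to catch up.
 */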
+ * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + return; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + + if (imp->imp_generation != imp->imp_last_generation_checked) + skip_committed_list = false; + + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + imp->imp_last_generation_checked = imp->imp_generation; + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + goto free_req; + } + + /* not yet committed */ + if (req->rq_transno > imp->imp_peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + imp->imp_peer_committed_transno); +free_req: + ptlrpc_free_request(req); + } + if (skip_committed_list) + return; + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); + ptlrpc_free_request(req); + } + } +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ +} +EXPORT_SYMBOL(ptlrpc_cleanup_client); + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* Request got reply but linked to the import list still. + Let ptlrpc_check_set() to process it. 
*/ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + if (req->rq_bulk) { + __u64 old_xid = req->rq_xid; + + /* ensure previous bulk fails */ + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_xid, req->rq_xid); + } + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_resend_req); + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_restart_req); + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + atomic_inc(&req->rq_refcount); + return req; +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = + list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! + */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} +EXPORT_SYMBOL(ptlrpc_retain_replayable_request); + +/** + * Send request and wait until it completes. + * Returns request processing status. 
+ */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("Unable to allocate ptlrpc set."); + return -ENOMEM; + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Callback used for replayed requests reply processing. + * In case of successful reply calls registered request replay callback. + * In case of error restart replay process. + */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + struct ptlrpc_replay_async_args *aa = data; + struct obd_import *imp = req->rq_import; + + atomic_dec(&imp->imp_replay_inflight); + + if (!ptlrpc_client_replied(req)) { + CERROR("request replay timed out, restarting recovery\n"); + rc = -ETIMEDOUT; + goto out; + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { + rc = lustre_msg_get_status(req->rq_repmsg); + goto out; + } + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + imp->imp_no_lock_replay = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. */ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + /** if replays by version then gap occur on server, no trust to locks */ + if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) + imp->imp_no_lock_replay = 1; + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno %llu is bigger than the replayed one: %llu", + req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + rc = -EINVAL; + goto out; + } + + DEBUG_REQ(D_HA, req, "got rep"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + } else { + /* Put it back for re-replay. 
*/ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + return rc; +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. + */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof(*aa)); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + return 0; +} +EXPORT_SYMBOL(ptlrpc_replay_req); + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + + /* Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + spin_lock(&imp->imp_lock); + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. 
*/ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_abort_inflight); + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + LASSERT(set != NULL); + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +static __u64 ptlrpc_last_xid; +static spinlock_t ptlrpc_last_xid_lock; + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. + * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time_t now = get_seconds(); + + spin_lock_init(&ptlrpc_last_xid_lock); + if (now < YEAR_2004) { + cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); + ptlrpc_last_xid >>= 2; + ptlrpc_last_xid |= (1ULL << 61); + } else { + ptlrpc_last_xid = (__u64)now << 20; + } + + /* Always need to be aligned to a power-of-two for multi-bulk BRW */ + CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); + ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + ptlrpc_last_xid = next; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +} +EXPORT_SYMBOL(ptlrpc_next_xid); + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. 
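+ *
+ * A small worked example of the xid scheme set up by ptlrpc_init_xid()
+ * and ptlrpc_next_xid() above (the constant is illustrative only:
+ * assume PTLRPC_BULK_OPS_COUNT == 16 and PTLRPC_BULK_OPS_MASK ==
+ * ~(PTLRPC_BULK_OPS_COUNT - 1)):
+ *
+ *    seed = ((__u64)get_seconds() << 20) & ~15ULL;  // aligned start
+ *    xid1 = seed + 16;    // first ptlrpc_next_xid()
+ *    xid2 = xid1 + 16;    // second ptlrpc_next_xid()
+ *
+ * Every returned xid stays 16-aligned, so one multi-bulk BRW RPC can
+ * use xid1 .. xid1 + 15 for its bulk transfers and the low four bits
+ * alone identify the individual transfer.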
+ */ +__u64 ptlrpc_sample_next_xid(void) +{ +#if BITS_PER_LONG == 32 + /* need to avoid possible word tearing on 32-bit systems */ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +#else + /* No need to lock, since returned value is racy anyways */ + return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; +#endif +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = get_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_reply_deadline = req->rq_deadline; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ptlrpc_work_async_args *arg = data; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + + might_sleep(); + + if (cb == NULL) + return ERR_PTR(-EINVAL); + + /* copy some code from deprecated fakereq. 
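+ *
+ * For reference, a minimal usage sketch of the work interface built by
+ * this function (flush_cb and cbdata are made-up names, purely for
+ * illustration):
+ *
+ *    static int flush_cb(const struct lu_env *env, void *data)
+ *    {
+ *        // runs in ptlrpcd context, so it must not sleep
+ *        return 0;
+ *    }
+ *
+ *    handler = ptlrpcd_alloc_work(imp, flush_cb, cbdata);
+ *    if (!IS_ERR(handler))
+ *        ptlrpcd_queue_work(handler);  // may be called repeatedly
+ *    ...
+ *    ptlrpcd_destroy_work(handler);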
*/ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (req == NULL) { + CERROR("ptlrpc: run out of memory!\n"); + return ERR_PTR(-ENOMEM); + } + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_export = NULL; + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_receiving_reply = 0; + req->rq_req_unlink = req->rq_reply_unlink = 0; + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); + INIT_LIST_HEAD(&req->rq_set_chain); + INIT_LIST_HEAD(&req->rq_history_list); + INIT_LIST_HEAD(&req->rq_exp_list); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + atomic_set(&req->rq_refcount, 1); + + CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); + args = ptlrpc_req_async_args(req); + args->cb = cb; + args->cbdata = cbdata; + + return req; +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c new file mode 100644 index 000000000..7e27397ce --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" + +#include "ptlrpc_internal.h" + +static struct cfs_hash *conn_hash; +static cfs_hash_ops_t conn_hash_ops; + +struct ptlrpc_connection * +ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + + conn = cfs_hash_lookup(conn_hash, &peer); + if (conn) + goto out; + + OBD_ALLOC_PTR(conn); + if (!conn) + return NULL; + + conn->c_peer = peer; + conn->c_self = self; + INIT_HLIST_NODE(&conn->c_hash); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the has will be + * returned and may be compared against out object. + */ + /* In the function below, .hs_keycmp resolves to + * conn_keycmp() */ + /* coverity[overrun-buffer-val] */ + conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash); + if (conn != conn2) { + OBD_FREE_PTR(conn); + conn = conn2; + } +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + return conn; +} +EXPORT_SYMBOL(ptlrpc_connection_get); + +int ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + int rc = 0; + + if (!conn) + return rc; + + LASSERT(atomic_read(&conn->c_refcount) > 1); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). + * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. 
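+ *
+ * A sketch of the resulting refcount convention (only restating the
+ * effect of the code here, not adding behaviour):
+ *
+ *    conn = ptlrpc_connection_get(peer, self, uuid);
+ *    // c_refcount >= 2: one ref held by conn_hash, one by the caller
+ *    ...
+ *    rc = ptlrpc_connection_put(conn);
+ *    // drops only the caller's ref; rc == 1 exactly when the cache
+ *    // now holds the sole remaining reference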
+ */ + if (atomic_dec_return(&conn->c_refcount) == 1) + rc = 1; + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_connection_put); + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + return conn; +} +EXPORT_SYMBOL(ptlrpc_connection_addref); + +int ptlrpc_connection_init(void) +{ + conn_hash = cfs_hash_create("CONN_HASH", + HASH_CONN_CUR_BITS, + HASH_CONN_MAX_BITS, + HASH_CONN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &conn_hash_ops, CFS_HASH_DEFAULT); + if (!conn_hash) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_connection_init); + +void ptlrpc_connection_fini(void) +{ + cfs_hash_putref(conn_hash); +} +EXPORT_SYMBOL(ptlrpc_connection_fini); + +/* + * Hash operations for net_peer<->connection + */ +static unsigned +conn_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask); +} + +static int +conn_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + const lnet_process_id_t *conn_key; + + LASSERT(key != NULL); + conn_key = (lnet_process_id_t *)key; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + + return conn_key->nid == conn->c_peer.nid && + conn_key->pid == conn->c_peer.pid; +} + +static void * +conn_key(struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + return &conn->c_peer; +} + +static void * +conn_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ptlrpc_connection, c_hash); +} + +static void +conn_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_inc(&conn->c_refcount); +} + +static void +conn_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_dec(&conn->c_refcount); +} + +static void +conn_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +static cfs_hash_ops_t conn_hash_ops = { + .hs_hash = conn_hashfn, + .hs_keycmp = conn_keycmp, + .hs_key = conn_key, + .hs_object = conn_object, + .hs_get = conn_get, + .hs_put_locked = conn_put_locked, + .hs_exit = conn_exit, +}; diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c new file mode 100644 index 000000000..73f8374f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c @@ -0,0 +1,380 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre/lustre_errno.h" + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = 
LUSTRE_EDOTDOT, + [EBADMSG] = LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + [ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + 
[LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + [LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = 
EIOCBQUEUED, +}; + +unsigned int lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c new file mode 100644 index 000000000..7f8644e01 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c @@ -0,0 +1,585 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" +# ifdef __mips64__ +# include +# endif + +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" +#include "ptlrpc_internal.h" + +lnet_handle_eq_t ptlrpc_eq_h; + +/* + * Client's outgoing request callback + */ +void request_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + sptlrpc_request_out_callback(req); + spin_lock(&req->rq_lock); + req->rq_real_sent = get_seconds(); + if (ev->unlinked) + req->rq_req_unlink = 0; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... 
*/ + + req->rq_net_err = 1; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->md.start == req->rq_repbuf); + LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlink = 0; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncate = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d replied=%d unlinked=%d", + ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + if (req->rq_replied) /* already got the real reply */ + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + req->rq_import->imp_last_reply_time = get_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + + LASSERT((desc->bd_type == BULK_PUT_SINK && + ev->type == LNET_EVENT_PUT) || + (desc->bd_type == BULK_GET_SOURCE && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_md_count > 0); + desc->bd_md_count--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_md_count == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + __u64 sec = req->rq_arrival_time.tv_sec; + __u32 usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */ + __u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT((char *)ev->md.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof(*req)); + } else { + LASSERT(ev->type == LNET_EVENT_PUT); + if (ev->status != 0) { + /* We moaned above already... */ + return; + } + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n", + service->srv_name, + libcfs_id2str(ev->initiator)); + return; + } + } + + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md.start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + do_gettimeofday(&req->rq_arrival_time); + req->rq_peer = ev->initiator; + req->rq_self = ev->target.nid; + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_timed_list); + INIT_LIST_HEAD(&req->rq_exp_list); + atomic_set(&req->rq_refcount, 1); + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ + LASSERT(ev->unlinked); + ptlrpc_rs_decref(rs); + return; + } + + LASSERT(rs->rs_on_net); + + if (ev->unlinked) { + /* Last network callback. 
The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_on_net = 0; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } +} + + +static void ptlrpc_master_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + void (*callback)(lnet_event_t *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT(cbid->cbid_arg != LP_POISON); + LASSERT(callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback); + + callback(ev); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int portals_compatibility; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL); + + peer->pid = LUSTRE_SRV_LNET_PID; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! use loopback LND */ + peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + if (portals_compatibility > 1) { + /* Strong portals compatibility: Zero the nid's + * NET, so if I'm reading new config logs, or + * getting configured by (new) lconf I can + * still talk to old servers. */ + dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid)); + src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid)); + } + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +void ptlrpc_ni_fini(void) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int rc; + int retries; + + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + for (retries = 0;; retries++) { + rc = LNetEQFree(ptlrpc_eq_h); + switch (rc) { + default: + LBUG(); + + case 0: + LNetNIFini(); + return; + + case -EBUSY: + if (retries != 0) + CWARN("Event queue still busy\n"); + + /* Wait for a bit */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + break; + } + } + /* notreached */ +} + +lnet_pid_t ptl_get_pid(void) +{ + lnet_pid_t pid; + + pid = LUSTRE_SRV_LNET_PID; + return pid; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG(D_NET, "Can't init network interface: %d\n", rc); + return -ENOENT; + } + + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... 
*/ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhead of serializing + * enqueue/dequeue operations in LNet. */ + rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); + if (rc == 0) + return 0; + + CERROR("Failed to allocate event queue: %d\n", rc); + LNetNIFini(); + + return -ENOMEM; +} + + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return -EIO; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c new file mode 100644 index 000000000..d5fc689c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c @@ -0,0 +1,1642 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_export.h" +#include "../include/obd.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. Must be called under imp_lock. 
+ */ +static void __import_set_state(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + } + + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + get_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +/* A CLOSED import should remain so. */ +#define IMPORT_SET_STATE_NOLOCK(imp, state) \ +do { \ + if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ + imp, obd2cli_tgt(imp->imp_obd), \ + ptlrpc_import_state_name(imp->imp_state), \ + ptlrpc_import_state_name(state)); \ + __import_set_state(imp, state); \ + } \ +} while (0) + +#define IMPORT_SET_STATE(imp, state) \ +do { \ + spin_lock(&imp->imp_lock); \ + IMPORT_SET_STATE_NOLOCK(imp, state); \ + spin_unlock(&imp->imp_lock); \ +} while (0) + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} +EXPORT_SYMBOL(deuuidify); + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
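+ *
+ * For example: if two bulk RPCs sent while imp_conn_cnt was 5 both time
+ * out, the first ptlrpc_set_import_discon(imp, 5) call finds the import
+ * FULL with a matching conn_cnt, moves it to LUSTRE_IMP_DISCON and
+ * returns 1; by the time the second failure is reported the import is
+ * no longer FULL (and a reconnect may already have increased
+ * imp_conn_cnt), so that call only logs the situation and returns 0.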
+ */ +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) +{ + assert_spin_locked(&imp->imp_lock); + + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + + ptlrpc_abort_inflight(imp); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_and_unlock_import(imp); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now) +{ + long dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time_t now = get_seconds(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
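+ *
+ * As a rough worked example of the wait loop below: if the farthest
+ * deadline among in-flight RPCs is 30 seconds away, the first sleep is
+ * about 30 + 30/3 = 40 seconds (obd_timeout is used instead when the
+ * computed value is zero), and the loop repeats until imp_inflight
+ * drops to zero.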
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct list_head *tmp, *n; + struct ptlrpc_request *req; + struct l_wait_info lwi; + unsigned int timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE, + "Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, + atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n", + count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: RPCs in \"%s\" phase found (%d). Network is sluggish? Waiting them to error out.\n", + cli_tgt, + ptlrpc_phase2str(RQ_PHASE_UNREGISTERING), + atomic_read(&imp-> + imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up_all(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_invalid = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +static void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt)) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + + ptlrpc_pinger_force(imp); + } +} +EXPORT_SYMBOL(ptlrpc_fail_import); + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef ENABLE_PINGER + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + int rc; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0), + &lwi); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically chose connection that we have not tried to connect to + * the longest + */ +static int import_select_connection(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_export *dlmexp; + char *target_start; + int target_len, tried_all = 1; + + spin_lock(&imp->imp_lock); + + if (list_empty(&imp->imp_conn_list)) { + CERROR("%s: no connections available\n", + imp->imp_obd->obd_name); + spin_unlock(&imp->imp_lock); + return -EINVAL; + } + + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", + imp->imp_obd->obd_name, + libcfs_nid2str(conn->oic_conn->c_peer.nid), + conn->oic_last_attempt); + + /* If we have not tried this connection since + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { + imp_conn = conn; + tried_all = 0; + break; + } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) + imp_conn = conn; + } + + /* if not found, simply choose the current one */ + if (!imp_conn || imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = 0; + } + LASSERT(imp_conn->oic_conn); + + /* If we've tried everything, and we're back to the beginning of the + list, increase our timeout and try again. It will be reset when + we do finally connect. (FIXME: really we should wait for all network + state associated with the last connection attempt to drain before + trying to reconnect on it.) 
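The selection policy described here can be distilled into a few lines. The sketch below is a simplified standalone model, not the Lustre code itself: struct conn, select_conn() and the timestamps are hypothetical, and the real code additionally handles forced reconnects and the "current" connection fallback.

#include <stdio.h>

/* Hypothetical, simplified stand-in for obd_import_conn: a name plus the
 * time of the last connect attempt (0 means "never tried"). */
struct conn {
        const char *name;
        unsigned long long last_attempt;
};

/* Prefer a connection not attempted since the last successful connect,
 * otherwise fall back to the least recently attempted entry. */
static const struct conn *select_conn(const struct conn *c, int n,
                                      unsigned long long last_success)
{
        const struct conn *best = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (c[i].last_attempt == 0 ||
                    c[i].last_attempt <= last_success)
                        return &c[i];   /* untried since last success */
                if (!best || c[i].last_attempt < best->last_attempt)
                        best = &c[i];   /* least recently attempted so far */
        }
        return best;
}

int main(void)
{
        struct conn conns[] = {
                { "nid-a", 150 }, { "nid-b", 120 }, { "nid-c", 200 },
        };

        /* Last success at time 100: every entry has been retried since,
         * so the least recently attempted one ("nid-b") is chosen. */
        printf("selected %s\n", select_conn(conns, 3, 100)->name);
        return 0;
}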
*/ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n", + imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = cfs_time_current_64(); + + /* switch connection, don't mind if it's same as the current one */ + if (imp->imp_connection) + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + LASSERT(dlmexp != NULL); + if (dlmexp->exp_connection) + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + + spin_unlock(&imp->imp_lock); + + return 0; +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + /* The requests in committed_list always have smaller transnos than + * the requests in replay_list */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, + "zero transno in committed_list"); + LBUG(); + } + return 1; + } + if (!list_empty(&imp->imp_replay_list)) { + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); + LBUG(); + } + return 1; + } + return 0; +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * Returns 0 on success or error code. 
+ */ +int ptlrpc_connect_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&imp->imp_connect_data }; + struct ptlrpc_connect_async_args *aa; + int rc; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + return -EINVAL; + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + return 0; + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + return -EALREADY; + } + + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + goto out; + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + goto out; + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &imp->imp_connect_data, NULL); + if (rc) + goto out; + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + goto out; + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. 
+ * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); + aa = ptlrpc_req_async_args(request); + memset(aa, 0, sizeof(*aa)); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1); + rc = 0; +out: + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. 
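The connect path above stashes the context its completion handler will need (the async-args structure checked by the CLASSERT) inside the request before queueing it. A standalone miniature of that stash-and-interpret pattern follows; the structures and the 32-byte scratch size are hypothetical, chosen only to illustrate the idea.

#include <stdio.h>
#include <string.h>

/* Hypothetical request with an opaque, suitably aligned scratch area that
 * plays the role of rq_async_args. */
struct request {
        unsigned long long async_args[4];       /* 32 bytes of scratch */
        int status;
};

struct connect_args {
        unsigned long long peer_committed;
        int initial_connect;
};

/* Completion callback: recover the stashed context from the request. */
static void interpret(struct request *req)
{
        struct connect_args *aa = (struct connect_args *)req->async_args;

        printf("reply rc=%d, initial=%d, committed=%llu\n",
               req->status, aa->initial_connect, aa->peer_committed);
}

int main(void)
{
        struct request req = { .status = 0 };
        struct connect_args *aa = (struct connect_args *)req.async_args;

        memset(aa, 0, sizeof(*aa));
        aa->peer_committed = 42;
        aa->initial_connect = 1;

        /* ... the request would be queued here; later the reply arrives ... */
        interpret(&req);
        return 0;
}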
+ */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct client_obd *cli = &imp->imp_obd->u.cli; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp; + int ret; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + return 0; + } + + if (rc) { + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); + goto out; + } + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + goto out; + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + /* check that server granted subset of flags we asked for. */ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + rc = -EPROTO; + goto out; + } + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + rc = -ENODEV; + goto out; + } + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + class_export_put(exp); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + } + + rc = 0; + goto finish; + } + + /* Determine what recovery state to move the import to. 
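The block that follows implements this decision from the reply flags. As a compressed standalone sketch of the idea only: the flag bits and state names below are hypothetical stand-ins, and the real code additionally compares server handles and tracks the saved replay state.

#include <stdio.h>

#define FLG_RECONNECT  0x1      /* stands in for MSG_CONNECT_RECONNECT */
#define FLG_RECOVERING 0x2      /* stands in for MSG_CONNECT_RECOVERING */

static const char *pick_state(unsigned flags, int imp_invalid, int replayable)
{
        if (flags & FLG_RECONNECT) {
                /* Reconnected to a server we already held a handle for. */
                if (imp_invalid)
                        return "EVICTED";
                if (flags & FLG_RECOVERING)
                        return "REPLAY";        /* resend replayable requests */
                return "RECOVER";
        }
        if ((flags & FLG_RECOVERING) && !imp_invalid && replayable)
                return "REPLAY";                /* fresh connect during recovery */
        return "EVICTED";                       /* no recovery possible */
}

int main(void)
{
        printf("%s\n", pick_state(FLG_RECONNECT | FLG_RECOVERING, 0, 1));
        printf("%s\n", pick_state(0, 0, 1));
        return 0;
}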
*/ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof(old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + rc = -ENOTCONN; + goto out; + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + rc = 0; + goto finish; + } + + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n", + imp->imp_obd->obd_name); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + IMPORT_SET_STATE(imp, imp->imp_replay_state); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)", + imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. */ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n"); + } + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! 
See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + } + +finish: + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc != 0) { + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + imp->imp_connect_tried = 1; + return 0; + } + } else { + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + list_add(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && + !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { + LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n", + imp->imp_obd->obd_name, + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + rc = -EPROTO; + goto out; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ + const char *older = "older. Consider upgrading server or downgrading client" + ; + const char *newer = "newer than client version. Consider upgrading client" + ; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? + newer : older, LUSTRE_VERSION_STRING); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) == 0)) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. 
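The checksum negotiation that this comment (continued just below) describes is a plain bitmask intersection. A simplified standalone sketch, with made-up capability bits rather than the real OBD_CKSUM_* values:

#include <stdio.h>

#define CKSUM_CRC32  0x1
#define CKSUM_ADLER  0x2
#define CKSUM_CRC32C 0x4

int main(void)
{
        unsigned client = CKSUM_ADLER | CKSUM_CRC32C;   /* what we understand */
        unsigned server = CKSUM_CRC32 | CKSUM_CRC32C;   /* what the reply offers */
        unsigned common = client & server;

        if (common == 0) {
                /* No overlap: disable checksums and fall back to a safe
                 * default, as the code below does with OBD_CKSUM_ADLER. */
                printf("no common checksum type, falling back to ADLER\n");
                common = CKSUM_ADLER;
        }
        printf("negotiated mask: 0x%x\n", common);
        return 0;
}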
The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + cksum_types_supported_client()) == 0) { + LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + cksum_types_supported_client()); + cli->cl_checksum = 0; + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || + aa->pcaa_initial_connect) { + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", + imp->imp_obd->obd_name, ocd->ocd_connect_flags); + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* We need a per-message support flag, because + a. we don't know if the incoming connect reply + supports AT or not (in reply_in_callback) + until we unpack it. + b. failovered server means export and flags are gone + (in ptlrpc_send_reply). + Can only be set when we know AT is supported at + both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + else + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + } + +out: + imp->imp_connect_tried = 1; + + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import(imp); + } + + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) + return -EPROTO; + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* + * Actually servers are only supposed to refuse + * connection from liblustre clients, so we + * should never see this from VFS context + */ + LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). 
Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import(imp); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + } + return -EPROTO; + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } + + wake_up_all(&imp->imp_recovery_waitq); + return rc; +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && + !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + return 0; +} + +/** + * Let server know that we have no requests to replay anymore. + * Achieved by just sending a PING request + */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) + return 0; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + atomic_inc(&imp->imp_replay_inflight); + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, + OBD_PING); + if (req == NULL) { + atomic_dec(&imp->imp_replay_inflight); + return -ENOMEM; + } + + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; + lustre_msg_add_flags(req->rq_reqmsg, + MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); + if (AT_OFF) + req->rq_timeout *= 3; + req->rq_interpret_reply = completed_replay_interpret; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + return 0; +} + +/** + * In kernel code all import invalidation happens in its own + * separate thread, so that whatever application happened to encounter + * a problem could still be killed or otherwise continue + */ +static int ptlrpc_invalidate_import_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + ptlrpc_invalidate_import(imp); + + if (obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + ptlrpc_import_recovery_state_machine(imp); + + class_import_put(imp); + return 0; +} + +/** + * This is the state machine for client-side recovery on import. + * + * Typically we have two possibly paths. If we came to server and it is not + * in recovery, we just enter IMP_EVICTED state, invalidate our import + * state and reconnect from scratch. + * If we came to server that is in recovery, we enter IMP_REPLAY import state. + * We go through our list of requests to replay and send them to server one by + * one. 
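The state sequence described here (and continued just below) can be summarised as a tiny linear state machine. The enum and next() helper are a hypothetical compression of the LUSTRE_IMP_* states, not the real definitions:

#include <stdio.h>

enum imp_state { REPLAY, REPLAY_LOCKS, REPLAY_WAIT, RECOVER, FULL };

static const char *names[] = {
        "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", "RECOVER", "FULL"
};

/* Advance one step once the work of the current state has drained. */
static enum imp_state next(enum imp_state s)
{
        switch (s) {
        case REPLAY:       return REPLAY_LOCKS; /* all requests replayed */
        case REPLAY_LOCKS: return REPLAY_WAIT;  /* locks re-requested */
        case REPLAY_WAIT:  return RECOVER;      /* "replay done" acknowledged */
        case RECOVER:      return FULL;         /* sending list resent */
        default:           return FULL;
        }
}

int main(void)
{
        enum imp_state s = REPLAY;

        while (s != FULL) {
                printf("%s -> ", names[s]);
                s = next(s);
        }
        printf("FULL\n");
        return 0;
}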
+ * After sending all request from the list we change import state to + * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server + * and also all the locks we don't yet have and wait for server to grant us. + * After that we send a special "replay completed" request and change import + * state to IMP_REPLAY_WAIT. + * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER + * state and resend all requests from sending list. + * After that we promote import to FULL state and send all delayed requests + * and import is fully operational after that. + * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + { + struct task_struct *task; + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client will evicted at this time, we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + CERROR("error starting invalidate thread: %d\n", rc); + rc = PTR_ERR(task); + } else { + rc = 0; + } + return rc; + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + goto out; + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + goto out; + } + + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + CDEBUG(D_HA, "reconnected to %s@%s\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + rc = ptlrpc_resend(imp); + if (rc) + goto out; + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up_all(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + return rc; +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + + if (imp->imp_obd->obd_force) + goto set_state; + + switch 
(imp->imp_connect_op) { + case OST_CONNECT: + rq_opc = OST_DISCONNECT; + break; + case MDS_CONNECT: + rq_opc = MDS_DISCONNECT; + break; + case MGS_CONNECT: + rq_opc = MGS_DISCONNECT; + break; + default: + rc = -EINVAL; + CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connect_op, rc); + return rc; + } + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + long timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = cfs_time_seconds(obd_timeout / 2); + else + timeout = cfs_time_seconds(obd_timeout); + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); + } + + lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), + back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) + goto out; + spin_unlock(&imp->imp_lock); + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req) { + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + else + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + + return rc; +} +EXPORT_SYMBOL(ptlrpc_disconnect_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + ptlrpc_abort_inflight(imp); +} +EXPORT_SYMBOL(ptlrpc_cleanup_imp); + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Bin into timeslices using AT_BINS bins. + This gives us a max of the last binlimit*AT_BINS secs without the storage, + but still smoothing out a return to normalcy from a slow response. + (E.g. remember the maximum latency in each minute of the last 4 minutes.) 
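The binning scheme this comment describes is worth seeing in isolation: keep the per-slice maximum in a small ring of bins, slide the bins as time passes, and report the worst value still inside the window. The sketch below is a standalone simplification (measure(), binwidth and the sample values are hypothetical), not the at_measured() implementation that follows.

#include <stdio.h>

#define BINS 4  /* mirrors AT_BINS: remember the last four slices */

static unsigned measure(unsigned hist[BINS], long *binstart, long now,
                        long binwidth, unsigned val)
{
        unsigned worst = 0;
        int i;

        if (now - *binstart >= binwidth) {
                /* Slide the window: one shift per elapsed slice, older
                 * slices fall off the end. */
                long shift = (now - *binstart) / binwidth;

                for (i = BINS - 1; i >= 0; i--)
                        hist[i] = (i >= shift) ? hist[i - shift] : 0;
                *binstart += shift * binwidth;
        }
        if (val > hist[0])
                hist[0] = val;
        for (i = 0; i < BINS; i++)
                if (hist[i] > worst)
                        worst = hist[i];
        return worst;
}

int main(void)
{
        unsigned hist[BINS] = { 0 };
        long binstart = 0;

        printf("%u\n", measure(hist, &binstart, 10, 60, 5));  /* 5 */
        printf("%u\n", measure(hist, &binstart, 70, 60, 2));  /* still 5 */
        printf("%u\n", measure(hist, &binstart, 260, 60, 1)); /* 5 aged out -> 2 */
        return 0;
}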
*/ +int at_measured(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time_t now = get_seconds(); + time_t binlimit = max_t(time_t, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + /* move bins over */ + shift = (now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for (i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + + if (at->at_current != old) + CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n", + at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c new file mode 100644 index 000000000..a42335e26 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c @@ -0,0 +1,2442 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#if !defined(__REQ_LAYOUT_USER__) + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +/* LUSTRE_VERSION_CODE */ +#include "../include/lustre_ver.h" + +#include "../include/obd_support.h" +/* lustre_swab_mdt_body */ +#include "../include/lustre/lustre_idl.h" +/* obd2cli_tgt() (required by DEBUG_REQ()) */ +#include "../include/obd.h" + +/* __REQ_LAYOUT_USER__ */ +#endif +/* struct ptlrpc_request, lustre_msg* */ +#include "../include/lustre_req_layout.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_debug.h" + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. 
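To make the "pill"/format idea above concrete, here is a toy model in which a request format is nothing more than a named pair of field lists, one for the client message and one for the server reply, each beginning with the common ptlrpc body descriptor. All names and sizes below are made up for illustration; the real tables follow in this file.

#include <stdio.h>

struct msg_field {
        const char *name;
        int size;       /* made-up byte counts; -1 means variable length */
};

static const struct msg_field PTLRPC_BODY = { "ptlrpc_body", 32 };
static const struct msg_field MDT_BODY    = { "mdt_body",    64 };

static const struct msg_field *mdt_body_only[] = { &PTLRPC_BODY, &MDT_BODY };

struct req_format {
        const char *name;
        const struct msg_field **fields[2];     /* [0]=client, [1]=server */
        int nfields[2];
};

static const struct req_format RQF_EXAMPLE = {
        "example request", { mdt_body_only, mdt_body_only }, { 2, 2 }
};

int main(void)
{
        int side, i;

        for (side = 0; side < 2; side++) {
                printf("%s (%s):\n", RQF_EXAMPLE.name,
                       side ? "server" : "client");
                for (i = 0; i < RQF_EXAMPLE.nfields[side]; i++) {
                        const struct msg_field *f = RQF_EXAMPLE.fields[side][i];

                        if (f->size < 0)
                                printf("  %s (variable)\n", f->name);
                        else
                                printf("  %s (%d bytes)\n", f->name, f->size);
                }
        }
        return 0;
}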
*/ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *log_cancel_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LOGCOOKIES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_release_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct 
req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + 
&RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE_REPLY, +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llogd_conn_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_CONN_BODY +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct 
req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_OBD_IDX_READ, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, + &RQF_MGS_SET_INFO, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GETSTATUS, + &RQF_MDS_STATFS, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_RELEASE_CLOSE, + &RQF_MDS_PIN, + &RQF_MDS_UNPIN, + &RQF_MDS_READPAGE, + &RQF_MDS_WRITEPAGE, + &RQF_MDS_IS_SUBDIR, + &RQF_MDS_DONE_WRITING, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_RMT_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_QUOTACHECK, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_UPDATE_OBJ, + &RQF_QC_CALLBACK, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACHECK, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO_GENERIC, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_DESC_CALLBACK, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + 
&RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LOG_CANCEL, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LLOG_ORIGIN_CONNECT, + &RQF_CONNECT, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = 1 << 0, + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = 1 << 1, + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = 1 << 2 +}; + +struct req_capsule; + +/* + * Request fields. + */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void *))(swabber), \ + .rmf_dumper = (void (*)(void *))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + 
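The DEFINE_MSGF() definitions that follow all rely on the same designated-initializer trick: one macro fills a descriptor with a name, a size and an optional byte-swap callback. A self-contained miniature of that pattern, with hypothetical names that are not part of the imported code:

#include <stdio.h>

struct field_desc {
        const char *name;
        int size;               /* -1 means variable length */
        void (*swab)(void *);   /* optional byte-swap hook */
};

#define DEFINE_FIELD(n, s, sw) { .name = (n), .size = (s), .swab = (sw) }

/* Byte-swap a 32-bit value in place, the role rmf_swabber plays above. */
static void swab_u32(void *p)
{
        unsigned int v = *(unsigned int *)p;

        *(unsigned int *)p = ((v & 0x000000ffu) << 24) |
                             ((v & 0x0000ff00u) << 8)  |
                             ((v & 0x00ff0000u) >> 8)  |
                             ((v & 0xff000000u) >> 24);
}

static struct field_desc GENERIC_U32 = DEFINE_FIELD("generic u32", 4, swab_u32);
static struct field_desc NAME_STRING = DEFINE_FIELD("name", -1, NULL);

int main(void)
{
        unsigned int x = 0x11223344u;

        if (GENERIC_U32.swab)
                GENERIC_U32.swab(&x);
        printf("%s swabbed: 0x%08x\n", GENERIC_U32.name, x);
        printf("%s: variable length (%d)\n", NAME_STRING.name,
               NAME_STRING.size);
        return 0;
}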
+struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGF("obd_quotactl", 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), + lustre_swab_gl_desc, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = + DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK, + LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +/* + * OST request field. 
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("u64", 0, + sizeof(u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1, + lustre_swab_update_buf, NULL); +EXPORT_SYMBOL(RMF_UPDATE); + +struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1, + lustre_swab_update_reply_buf, + NULL); +EXPORT_SYMBOL(RMF_UPDATE_REPLY); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); +/* + * Request formats. 
+ */ + +struct req_format { + const char *rf_name; + int rf_idx; + struct { + int nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +struct req_format RQF_LOG_CANCEL = + DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); +EXPORT_SYMBOL(RQF_LOG_CANCEL); + +struct req_format RQF_MDS_QUOTACHECK = + DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_MDS_QUOTACHECK); + +struct req_format RQF_OST_QUOTACHECK = + DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_OST_QUOTACHECK); + +struct req_format RQF_MDS_QUOTACTL = + DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_MDS_QUOTACTL); + +struct req_format RQF_OST_QUOTACTL = + DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_OST_QUOTACTL); + +struct req_format RQF_QC_CALLBACK = + DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_QC_CALLBACK); + +struct req_format RQF_QUOTA_DQACQ = + DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only); +EXPORT_SYMBOL(RQF_QUOTA_DQACQ); + +struct req_format RQF_LDLM_INTENT_QUOTA = + DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA", + ldlm_intent_quota_client, + ldlm_intent_quota_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA); + +struct req_format RQF_MDS_GETSTATUS = + DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_GETSTATUS); + +struct req_format RQF_MDS_STATFS = + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS); + +struct req_format RQF_MDS_SYNC = + DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_SYNC); + +struct req_format RQF_MDS_GETATTR = + DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR); + +struct req_format RQF_MDS_GETXATTR = + 
DEFINE_REQ_FMT0("MDS_GETXATTR", + mds_getxattr_client, mds_getxattr_server); +EXPORT_SYMBOL(RQF_MDS_GETXATTR); + +struct req_format RQF_MDS_GETATTR_NAME = + DEFINE_REQ_FMT0("MDS_GETATTR_NAME", + mds_getattr_name_client, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); + +struct req_format RQF_MDS_REINT = + DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT); + +struct req_format RQF_MDS_REINT_CREATE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE", + mds_reint_create_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); + +struct req_format RQF_MDS_REINT_CREATE_RMT_ACL = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL", + mds_reint_create_rmt_acl_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL); + +struct req_format RQF_MDS_REINT_CREATE_SLAVE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", + mds_reint_create_slave_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); + +struct req_format RQF_MDS_REINT_CREATE_SYM = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", + mds_reint_create_sym_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); + +struct req_format RQF_MDS_REINT_OPEN = + DEFINE_REQ_FMT0("MDS_REINT_OPEN", + mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_UPDATE_OBJ = + DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client, + mds_update_server); +EXPORT_SYMBOL(RQF_UPDATE_OBJ); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", 
ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_DESC_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_UNLINK = + DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", + ldlm_intent_unlink_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_RELEASE_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_release_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE); + +struct req_format RQF_MDS_PIN = + DEFINE_REQ_FMT0("MDS_PIN", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_PIN); + +struct req_format RQF_MDS_UNPIN = + DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty); +EXPORT_SYMBOL(RQF_MDS_UNPIN); + +struct req_format RQF_MDS_DONE_WRITING = + DEFINE_REQ_FMT0("MDS_DONE_WRITING", + mdt_close_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_DONE_WRITING); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + 
DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +/* This is for split */ +struct req_format RQF_MDS_WRITEPAGE = + DEFINE_REQ_FMT0("MDS_WRITEPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_WRITEPAGE); + +struct req_format RQF_MDS_IS_SUBDIR = + DEFINE_REQ_FMT0("MDS_IS_SUBDIR", + mdt_body_only, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", + llogd_body_only, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_LLOG_ORIGIN_CONNECT = + DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); + +struct req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); 
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO_GENERIC = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +#if !defined(__REQ_LAYOUT_USER__) + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. + */ +int req_layout_init(void) +{ + int i; + int j; + int k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. 
+ * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + memset(pill, 0, sizeof(*pill)); + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return + 0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + struct ptlrpc_request *req; + + req = pill->rc_req; + return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. + */ +int req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + int i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format `%s': ", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). 
+ */ +static int __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR); + return offset; +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static +void +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, int dump, void (*swabber)(void *)) +{ + void *p; + int i; + int n; + int do_swab; + int inout = loc == RCL_CLIENT; + + swabber = swabber ?: field->rmf_swabber; + + if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && + swabber != NULL && value != NULL) + do_swab = 1; + else + do_swab = 0; + + if (!field->rmf_dumper) + dump = 0; + + if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", + do_swab ? "unswabbed " : "", field->rmf_name); + field->rmf_dumper(value); + } + if (!do_swab) + return; + swabber(value); + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed field %s follows\n", + field->rmf_name); + field->rmf_dumper(value); + } + + return; + } + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. + */ + LASSERT((len % field->rmf_size) == 0); + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, element %d follows\n", + do_swab ? "unswabbed " : "", field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) + continue; + swabber(p); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, element %d follows\n", + field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)(void *), + int dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + int len; + int offset; + + void *(*getter)(struct lustre_msg *m, int n, int minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? 
+ (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & RMF_F_STRUCT_ARRAY) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if ((len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch %d modulo %d != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max(field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field `%s' (%d of %d) in format `%s': %d vs. %d (%s)\n", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +static void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + int len; + int i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, "Field %s has no dumper function; field size is %d\n", + field->rmf_name, len); + } else { + /* It's the dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, 1); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. + */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. 
+ */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. + */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's reply \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). + * + * This function is convenient for use in code that could be executed on the + * client and server alike. + */ +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_other_get); + +/** + * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a + * field of the given \a pill. + * + * This function must be used when constructing variable sized fields of a + * request or reply.
+ */ +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + if ((size != field->rmf_size) && + (field->rmf_size != -1) && + !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (size > 0)) { + if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && + (size % field->rmf_size != 0)) { + CERROR("%s: array field size mismatch %d %% %d != 0 (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && + size < field->rmf_size) { + CERROR("%s: field size mismatch %d != %d (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } + } + + pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size; +} +EXPORT_SYMBOL(req_capsule_set_size); + +/** + * Return the actual PTLRPC buffer length of a request or reply (\a loc) + * for the given \a pill's given \a field. + * + * NB: this function doesn't correspond with req_capsule_set_size(), which + * actually sets the size in pill.rc_area[loc][offset], but this function + * returns the message buflen[offset], maybe we should use another name. + */ +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return lustre_msg_buflen(__req_msg(pill, loc), + __req_capsule_offset(pill, field, loc)); +} +EXPORT_SYMBOL(req_capsule_get_size); + +/** + * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the + * given \a pill's request or reply (\a loc) given the field size recorded in + * the \a pill's rc_area. + * + * See also req_capsule_set_size(). + */ +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) +{ + return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, + pill->rc_fmt->rf_fields[loc].nr, + pill->rc_area[loc]); +} + +/** + * While req_capsule_msg_size() computes the size of a PTLRPC request or reply + * (\a loc) given a \a pill's \a rc_area, this function computes the size of a + * PTLRPC request or reply given only an RQF (\a fmt). + * + * This function should not be used for formats which contain variable size + * fields. + */ +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc) +{ + int size, i = 0; + + /* + * This function should probably LASSERT() that fmt has no fields with + * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many + * elements in the array there will ultimately be, but then, we could + * assume that there will be at least one element, and that's just what + * we do. + */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size < 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. 
+ * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + int j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... + */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend().
+ */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + int len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + else + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int offset, len, rc; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + if (pill->rc_req->rq_repbuf_len >= + lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen) + CERROR("Inplace repack might be done\n"); + + pill->rc_req->rq_reply_state = NULL; + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old rs back, the caller will decide what to do */ + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + /* Now we need only buffers, copy first chunk */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from; + memcpy(to, from, len); + /* check if we have tail and copy it too */ + if (rs->rs_msg->lm_bufcount > offset + 1) { + to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0); + from = lustre_msg_buf(rs->rs_msg, offset + 1, 0); + offset = rs->rs_msg->lm_bufcount - 1; + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) + + cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from; + memcpy(to, from, len); + } + /* drop old reply if everything is fine */ + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); +/* __REQ_LAYOUT_USER__ */ +#endif diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c new file mode 100644 index 000000000..e9baf5bbe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c @@ -0,0 +1,366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "../include/lustre_net.h" +#include <linux/list.h> + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return (-EINVAL); \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context.
*/ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + goto out; + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, + body->lgd_llh_flags); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + /* The log records are swabbed 
as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) { + rc = -EFAULT; + goto out; + } + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + memcpy(buf, ptr, len); +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) { + rc = -EFAULT; + goto out; + } + + memcpy(buf, ptr, len); +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (hdr == NULL) { + rc = -EFAULT; + goto out; + } + + memcpy(handle->lgh_hdr, hdr, sizeof(*hdr)); + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("incorrectly sized log header: %#x (expecting %#x)\n", + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. 
*/ + return 0; +} + +struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_destroy = llog_client_destroy, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c new file mode 100644 index 000000000..dac66f5b3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include <linux/list.h> + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + return 0; +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 000000000..9533ab976 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_net.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GETSTATUS, "mds_getstatus" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" 
}, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { OBD_LOG_CANCEL, "llog_cancel" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, + { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, + { LLOG_CATINFO, "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" }, + { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { UPDATE_OBJ, "update_obj" }, +}; + +static struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. 
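+ * For illustration only (a hedged example derived from the table
+ * above, not part of the original interface documentation):
+ *
+ *   ll_opcode2str(OST_READ)     returns "ost_read"
+ *   ll_opcode2str(MDS_CONNECT)  returns "mds_connect"
+ *
+ * opcode_offset() (see ptlrpc_internal.h) effectively collapses the
+ * disjoint opcode ranges into a dense 0 .. LUSTRE_MAX_OPCODES-1 index,
+ * which the two assertions below verify against this table.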
+ */ + __u32 offset = opcode_offset(opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + return ll_rpc_opcode_table[offset].opname; +} + +static const char *ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} + +#if defined(CONFIG_PROC_FS) +static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, + char *name, + struct proc_dir_entry **procroot_ret, + struct lprocfs_stats **stats_ret) +{ + struct proc_dir_entry *svc_procroot; + struct lprocfs_stats *svc_stats; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(*procroot_ret == NULL); + LASSERT(*stats_ret == NULL); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES, + 0); + if (svc_stats == NULL) + return; + + if (dir) { + svc_procroot = lprocfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_procroot)) { + lprocfs_free_stats(&svc_stats); + return; + } + } else { + svc_procroot = root; + } + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { + char *units; + + switch (i) { + case BRW_WRITE_BYTES: + case BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + if (rc < 0) { + if (dir) + lprocfs_remove(&svc_procroot); + lprocfs_free_stats(&svc_stats); + } else { + if (dir) + *procroot_ret = svc_procroot; + *stats_ret = svc_stats; + } +} + +static int +ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + +static int +ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + seq_printf(m, "%d\n", total); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int bufpages; + int val; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + 
if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. */ + bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (val > totalram_pages / (2 * bufpages)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +static int +ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_min_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); + +static int +ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); + +static int +ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. 
+ * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned num_pols; + unsigned pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC(infos, num_pols * sizeof(*infos)); + if (infos == NULL) { + rc = -ENOMEM; + goto out; + } +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. + */ + } else { + LASSERT(strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) == 0); + /** + * Not asserting ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + LASSERT(infos[pol_idx].pi_fallback == + tmp.pi_fallback); + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + seq_printf(m, "%s\n", + !hp ? 
"\nregular_requests:" : "high_priority_requests:"); + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + seq_printf(m, " - name: %s\n" + " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + infos[pol_idx].pi_name, + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + +out: + if (infos) + OBD_FREE(infos, num_pols * sizeof(*infos)); + + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} + +/** + * The longest valid command string is the maximum policy name size, plus the + * length of the " reg" substring + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1) + +/** + * Starts and stops a given policy on a PTLRPC service. + * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head. + */ +static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *token; + int rc = 0; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) { + rc = -EINVAL; + goto out; + } + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) { + rc = -ENOMEM; + goto out; + } + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) { + rc = -EFAULT; + goto out; + } + + cmd[count] = '\0'; + + token = strsep(&cmd, " "); + + if (strlen(token) > NRS_POL_NAME_MAX - 1) { + rc = -EINVAL; + goto out; + } + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + /** + * The second token is either NULL, or an optional [reg|hp] string + */ + if (strcmp(cmd, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(cmd, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + rc = -EINVAL; + goto out; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) { + rc = -ENODEV; + goto out; + } else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START, + false, NULL); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + return rc < 0 ? rc : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. 
more + * recent), search from it onwards. + * Since the service history is LRU (i.e. culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t %d can't match size of u64\n", + (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! 
*/ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + req = srhi->srhi_req; + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! */ + seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%ld:%lds(%+lds) ", + req->rq_history_seq, libcfs_nid2str(req->rq_self), + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_deadline); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = PDE_DATA(inode); + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct dhms ts; + time_t worstt; + unsigned int cur; + unsigned int worst; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur = at_get(&svcpt->scp_at_estimate); + worst = svcpt->scp_at_estimate.at_worst_ever; + worstt = svcpt->scp_at_estimate.at_worst_time; + s2dhms(&ts, get_seconds() - worstt); + + seq_printf(m, "%10s : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", "service", + cur, worst, worstt, DHMS_VARS(&ts)); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + seq_printf(m, "%d", svc->srv_hpreq_ratio); + return 0; +} + +static ssize_t 
ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int rc; + int val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); + +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, + struct ptlrpc_service *svc) +{ + struct lprocfs_vars lproc_vars[] = { + {.name = "high_priority_ratio", + .fops = &ptlrpc_lprocfs_hp_ratio_fops, + .data = svc}, + {.name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_history_len_fops, + .data = svc}, + {.name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_history_max_fops, + .data = svc}, + {.name = "threads_min", + .fops = &ptlrpc_lprocfs_threads_min_fops, + .data = svc}, + {.name = "threads_max", + .fops = &ptlrpc_lprocfs_threads_max_fops, + .data = svc}, + {.name = "threads_started", + .fops = &ptlrpc_lprocfs_threads_started_fops, + .data = svc}, + {.name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc}, + {.name = "nrs_policies", + .fops = &ptlrpc_lprocfs_nrs_fops, + .data = svc}, + {NULL} + }; + static const struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + int rc; + + ptlrpc_lprocfs_register(entry, svc->srv_name, + "stats", &svc->srv_procroot, + &svc->srv_stats); + + if (svc->srv_procroot == NULL) + return; + + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + + rc = lprocfs_seq_create(svc->srv_procroot, "req_history", + 0400, &req_history_fops, svc); + if (rc) + CWARN("Error adding the req_history file\n"); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) +{ + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, + &obddev->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot != NULL) + lprocfs_remove(&svc->srv_procroot); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + if (obd->obd_svc_procroot) + 
lprocfs_remove(&obd->obd_svc_procroot); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + + +#define BUFLEN (UUID_MAX + 5) + +int lprocfs_wr_evict_client(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + char *kbuf; + char *tmpbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count)); + /* Kludge code(deadlock situation): the lprocfs lock has been held + * since the client is evicted by writing client's + * uuid/nid to procfs "evict_client" entry. However, + * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy + * the proc entries under the being destroyed export{}, so I have + * to drop the lock at first here. + * - jay, jxiong@clusterfs.com */ + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_evict_client); + +#undef BUFLEN + +int lprocfs_wr_ping(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct ptlrpc_request *req; + int rc; + + LPROCFS_CLIMP_CHECK(obd); + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + LPROCFS_CLIMP_EXIT(obd); + if (req == NULL) + return -ENOMEM; + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + if (rc >= 0) + return count; + return rc; +} +EXPORT_SYMBOL(lprocfs_wr_ping); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import". 
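+ *
+ * A hedged sketch of the two accepted forms (paths and values below
+ * are illustrative only):
+ *
+ *   echo "connection=UUID" > .../import
+ *       always calls ptlrpc_recover_import() for that UUID
+ *   echo "connection=UUID::INSTANCE" > .../import
+ *       only reconnects when INSTANCE parses and differs from the
+ *       import's current ocd_instance (see the "::" handling below)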
+ */ +int lprocfs_wr_import(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_import *imp = obd->u.cli.cl_import; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + + if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) { + count = -EFAULT; + goto out; + } + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) { + count = -EINVAL; + goto out; + } + + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + __u32 inst; + char *endptr; + + *ptr = 0; + do_reconn = 0; + ptr += strlen("::"); + inst = simple_strtol(ptr, &endptr, 10); + if (*endptr) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, "IR: %s has already been connecting to new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + +out: + OBD_FREE(kbuf, count + 1); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_import); + +int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + + LPROCFS_CLIMP_CHECK(obd); + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_pinger_recov); + +int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(obd); + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + LPROCFS_CLIMP_EXIT(obd); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_pinger_recov); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c new file mode 100644 index 000000000..2fa258558 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c @@ -0,0 +1,731 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_lib.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, + lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, + struct ptlrpc_connection *conn, int portal, __u64 xid, + unsigned int offset) +{ + int rc; + lnet_md_t md; + + LASSERT(portal != 0); + LASSERT(conn != NULL); + CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.eq_handle = ptlrpc_eq_h; + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, + OBD_FAIL_ONCE))) { + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + rc = LNetPut(conn->c_self, *mdh, ack, + conn->c_peer, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(conn->c_peer), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + return 0; +} + +static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) +{ + int i; + + for (i = 0; i < count; i++) + LNetMDUnlink(bd_mds[i]); +} + + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
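+ *
+ * Match-bit layout, sketched with hypothetical numbers: if rq_xid is
+ * 0x1005 and bd_md_max_brw is 8, the first bulk MD is posted at xid
+ * 0x1000 (rq_xid rounded down to a bd_md_max_brw boundary), further
+ * MDs at 0x1001, 0x1002, ..., and rq_xid is finally reset to the xid
+ * of the last MD actually posted (see the comments in the body below).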
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + lnet_process_id_t peer; + int rc = 0; + int rc2; + int posted_md; + int total_md; + __u64 xid; + lnet_handle_me_t me_h; + lnet_md_t md; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + return 0; + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_count == 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(desc->bd_type == BULK_PUT_SINK || + desc->bd_type == BULK_GET_SOURCE); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else + LASSERT(desc->bd_nob_transferred == 0); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* An XID is only used for a single request from the client. + * For retried bulk transfers, a new XID will be allocated in + * in ptlrpc_check_set() if it needs to be resent, so it is not + * using the same RDMA match bits after an error. + * + * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The + * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */ + xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + xid != desc->bd_last_xid, + "registered: %d rq_xid: %llu bd_last_xid: %llu\n", + desc->bd_registered, xid, desc->bd_last_xid); + + total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + desc->bd_registered = 1; + desc->bd_last_xid = xid; + desc->bd_md_count = total_md; + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { + md.options = PTLRPC_MD_OPTIONS | + ((desc->bd_type == BULK_GET_SOURCE) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, xid, + posted_md, rc); + break; + } + + /* About to let the network at it... 
*/ + rc = LNetMDAttach(me_h, md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, xid, + posted_md, rc); + rc2 = LNetMEUnlink(me_h); + LASSERT(rc2 == 0); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_md_count -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_md_count >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + return -ENOMEM; + } + + /* Set rq_xid to matchbits of the final bulk so that server can + * infer the number of bulks that were prepared */ + req->rq_xid = --xid; + LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), + "bd_last_xid = x%llu, rq_xid = x%llu\n", + desc->bd_last_xid, req->rq_xid); + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the xid */ + if (desc->bd_md_count != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), + total_md - desc->bd_md_count); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n", + desc->bd_md_count, + desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_xid, req->rq_xid, desc->bd_portal); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_register_bulk); + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + wait_queue_head_t *wq; + struct l_wait_info lwi; + int rc; + + LASSERT(!in_interrupt()); /* might sleep */ + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0) + req->rq_bulk_deadline = get_seconds() + LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + return 1; /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + return 1; /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); + + /* Do not wait for unlink to finish. 
*/ + if (async) + return 0; + + if (req->rq_set != NULL) + wq = &req->rq_set->set_waitq; + else + wq = &req->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + return 1; + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_bulk); + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int service_time = max_t(int, get_seconds() - + req->rq_arrival_time.tv_sec, 1); + + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ + int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's a error reply during recovery. + * (bz15815) */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering)) + lustre_msg_set_timeout(req->rq_repmsg, 0); + else + lustre_msg_set_timeout(req->rq_repmsg, + at_get(&svcpt->scp_at_estimate)); + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%d:%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_is_v1(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). 
+ */ + LASSERT(req->rq_no_reply == 0); + LASSERT(req->rq_reqbuf != NULL); + LASSERT(rs != NULL); + LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT(req->rq_repmsg != NULL); + LASSERT(req->rq_repmsg == rs->rs_msg); + LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT(rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + /* In order to keep interoperability with the client (< 2.3) which + * doesn't have pb_jobid in ptlrpc_body, We have to shrink the + * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the + * reply buffer on client will be overflow. + * + * XXX Remove this whenever we drop the interoperability with + * such client. + */ + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, + sizeof(struct ptlrpc_body_v2), 1); + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = get_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, conn, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} +EXPORT_SYMBOL(ptlrpc_send_reply); + +int ptlrpc_reply(struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + return ptlrpc_send_reply(req, 0); +} +EXPORT_SYMBOL(ptlrpc_reply); + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + + if (req->rq_no_reply) + return 0; + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + return rc; +} +EXPORT_SYMBOL(ptlrpc_send_error); + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} +EXPORT_SYMBOL(ptlrpc_error); + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
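+ *
+ * Rough order of operations, summarising the body below rather than
+ * specifying it: wrap the request via sptlrpc, register the bulk
+ * descriptor (if any), attach a reply ME/MD unless \a noreply is set,
+ * stamp the arrival time and deadline, then hand the request buffer
+ * to LNetPut() through ptl_send_buf().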
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + int rc2; + int mpflag = 0; + struct ptlrpc_connection *connection; + lnet_handle_me_t reply_me_h; + lnet_md_t reply_md; + struct obd_device *obd = request->rq_import->imp_obd; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + return 0; + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (request->rq_import->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + return -ENODEV; + } + + connection = request->rq_import->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &request->rq_import->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + request->rq_import->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + request->rq_import->imp_msghdr_flags); + + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + goto out; + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk(request); + if (rc != 0) + goto out; + } + + if (!noreply) { + LASSERT(request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + goto cleanup_bulk; + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + rc = -ENOMEM; + goto cleanup_bulk; + } + } + + spin_lock(&request->rq_lock); + /* If the MD attach succeeds, there _will_ be a reply_in callback */ + request->rq_receiving_reply = !noreply; + request->rq_req_unlink = 1; + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlink = !noreply; + /* Clear any flags that may be present from previous sends. 
*/ + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncate = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + /* We must see the unlink callback to unset rq_reply_unlink, + so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + rc = -ENOMEM; + goto cleanup_me; + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&request->rq_import->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + do_gettimeofday(&request->rq_arrival_time); + request->rq_sent = get_seconds(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + ptlrpc_pinger_sending_on_import(request->rq_import); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + connection, + request->rq_request_portal, + request->rq_xid, 0); + if (rc == 0) + goto out; + + ptlrpc_req_finished(request); + if (noreply) + goto out; + + cleanup_me: + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT(rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (request->rq_memalloc) + cfs_memory_pressure_restore(mpflag); + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. 
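+ *
+ * The buffer is attached on the service's request portal with a
+ * wildcard match entry ({LNET_NID_ANY, LNET_PID_ANY}, ignore bits ~0),
+ * so any incoming request PUT may land in it; see the LNetMEAttach()
+ * call below.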
+ */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; + int rc; + lnet_md_t md; + lnet_handle_me_t me_h; + + CDEBUG(D_NET, "LNetMEAttach: portal %d\n", + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return -ENOMEM; + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + rc = LNetMEAttach(service->srv_req_portal, + match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + return -ENOMEM; + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) + return 0; + + CERROR("LNetMDAttach failed: %d;\n", rc); + LASSERT(rc == -ENOMEM); + rc = LNetMEUnlink(me_h); + LASSERT(rc == 0); + rqbd->rqbd_refcount = 0; + + return -ENOMEM; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c new file mode 100644 index 000000000..81ad74732 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c @@ -0,0 +1,1754 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lprocfs_status.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "ptlrpc_internal.h" + +/* XXX: This is just for liblustre. Remove the #if defined directive when the + * "cfs_" prefix is dropped from cfs_list_head. */ +extern struct list_head ptlrpc_all_services; + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? 
+ policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + return -ENODEV; + + return policy->pol_desc->pd_ops->op_policy_ctl != NULL ? + policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS; +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) { + spin_unlock(&nrs->nrs_lock); + + policy->pol_desc->pd_ops->op_policy_stop(policy); + + spin_lock(&nrs->nrs_lock); + } + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + return -EPERM; + + if (policy->pol_state == NRS_POL_STATE_STARTING) + return -EAGAIN; + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + return 0; + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + return 0; +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. + * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + + if (tmp == NULL) + return; + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. 
+ * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. + */ + if (nrs->nrs_policy_starting) + return -EAGAIN; + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + return -EAGAIN; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + return 0; + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + return -EPERM; + + if (policy->pol_state == NRS_POL_STATE_STARTED) + return 0; + } + + /** + * Increase the module usage count for policies registering from other + * modules. + */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + return -ENODEV; + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + goto out; + } + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + return rc; +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). 
+ */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. + * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). 
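+ *
+ * On return, \a resp[NRS_RES_FALLBACK] always holds a valid resource
+ * reference, while \a resp[NRS_RES_PRIMARY] may be NULL if no primary
+ * policy is started, or if the primary policy declined to serve this
+ * particular request (in which case its policy reference is dropped
+ * again).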
+ */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. + * + * \param resp the resource hierarchy that is being released + * + * \see ptlrpcnrs_req_hp_move() + * \see ptlrpc_nrs_req_finalize() + */ +static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) +{ + struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; + struct ptlrpc_nrs *nrs = NULL; + int i; + + for (i = 0; i < NRS_RES_MAX; i++) { + if (resp[i] != NULL) { + pols[i] = resp[i]->res_policy; + nrs_resource_put(resp[i]); + resp[i] = NULL; + } else { + pols[i] = NULL; + } + } + + for (i = 0; i < NRS_RES_MAX; i++) { + if (pols[i] == NULL) + continue; + + if (nrs == NULL) { + nrs = pols[i]->pol_nrs; + spin_lock(&nrs->nrs_lock); + } + nrs_policy_put_locked(pols[i]); + } + + if (nrs != NULL) + spin_unlock(&nrs->nrs_lock); +} + +/** + * Obtains an NRS request from \a policy for handling or examination; the + * request should be removed in the 'handling' case. + * + * Calling into this function implies we already know the policy has a request + * waiting to be handled. + * + * \param[in] policy the policy from which a request + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the NRS request to be handled + */ +static inline +struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct ptlrpc_nrs_request *nrq; + + LASSERT(policy->pol_req_queued > 0); + + nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); + + LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy)); + + return nrq; +} + +/** + * Enqueues request \a nrq for later handling, via one one the policies for + * which resources where earlier obtained via nrs_resource_get_safe(). The + * function attempts to enqueue the request first on the primary policy + * (if any), since this is the preferred choice. 
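+ *
+ * Concretely, nr_res_ptrs[NRS_RES_PRIMARY] is tried before
+ * nr_res_ptrs[NRS_RES_FALLBACK]; the fallback entry is only used when no
+ * primary resource was obtained, or when the primary policy's
+ * op_req_enqueue() rejects the request.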
+ * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. + * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + rc = -ENOENT; + goto out; + } + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + return rc; +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. 
+ * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + return -ENOENT; + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + return -EBUSY; + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + return 0; +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + return -ENOMEM; + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + return rc; + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it for %s\n", + policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + return -EEXIST; + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + return rc; +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. 
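+ *
+ * The caller is expected to hold ptlrpc_service_part::scp_req_lock, as is
+ * the case in ptlrpc_nrs_req_add() and ptlrpc_nrs_req_hp_move().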
+ */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. + * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + return rc; +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. 
+ * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + + return nrs_register_policies_locked(nrs); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + goto out; + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + goto out; + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) { + rc = -ENOMEM; + goto out; + } + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + return rc; +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); +} + +/** + * Returns the descriptor for a policy as identified by by \a name. + * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + return tmp; + } + return NULL; +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. 
+ * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + return rc; + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + return rc; +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. + * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. 
Please check policy flags; external policies cannot act as fallback policies, or be started immediately upon registration without interaction with lprocfs\n", + conf->nc_name); + return -EINVAL; + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already been registered with NRS core!\n", + conf->nc_name); + rc = -EEXIST; + goto fail; + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) { + rc = -ENOMEM; + goto fail; + } + + strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX); + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + goto fail; + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + goto fail; + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_register); + +/** + * Unregisters a previously registered policy with NRS core. All instances of + * the policy on all NRS heads of all supported services are removed. + * + * N.B. This function should only be called from a module's exit() function. + * Although it can be used for policies that ship alongside NRS core, the + * function is primarily intended for policies that register externally, + * from other modules. 
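+ *
+ * A minimal sketch of how an external policy module might pair the two
+ * calls; the "foo" policy name, its ops table and the module init/exit
+ * wrappers below are purely illustrative and not part of this file:
+ *
+ *	static struct ptlrpc_nrs_pol_conf nrs_conf_foo = {
+ *		.nc_name	= "foo",
+ *		.nc_ops		= &nrs_foo_ops,
+ *		.nc_compat	= nrs_policy_compat_all,
+ *		.nc_flags	= PTLRPC_NRS_FL_REG_EXTERN,
+ *		.nc_owner	= THIS_MODULE,
+ *	};
+ *
+ *	static int __init nrs_foo_init(void)
+ *	{
+ *		return ptlrpc_nrs_policy_register(&nrs_conf_foo);
+ *	}
+ *
+ *	static void __exit nrs_foo_exit(void)
+ *	{
+ *		ptlrpc_nrs_policy_unregister(&nrs_conf_foo);
+ *	}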
+ * + * \param[in] conf configuration information for the policy to unregister + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_nrs_pol_desc *desc; + int rc; + + LASSERT(conf != NULL); + + if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) { + CERROR("Unable to unregister a fallback policy, unless the PTLRPC service is stopping.\n"); + return -EPERM; + } + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + mutex_lock(&nrs_core.nrs_mutex); + + desc = nrs_policy_find_desc_locked(conf->nc_name); + if (desc == NULL) { + CERROR("Failing to unregister NRS policy %s which has not been registered with NRS core!\n", + conf->nc_name); + rc = -ENOENT; + goto not_exist; + } + + mutex_lock(&ptlrpc_all_services_mutex); + + rc = nrs_policy_unregister_locked(desc); + if (rc < 0) { + if (rc == -EBUSY) + CERROR("Please first stop policy %s on all service partitions and then retry to unregister the policy.\n", + conf->nc_name); + goto fail; + } + + CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n", + conf->nc_name); + + list_del(&desc->pd_list); + OBD_FREE_PTR(desc); + +fail: + mutex_unlock(&ptlrpc_all_services_mutex); + +not_exist: + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister); + +/** + * Setup NRS heads on all service partitions of service \a svc, and register + * all compatible policies on those NRS heads. + * + * To be called from within ptl + * \param[in] svc the service to setup + * + * \retval -ve error, the calling logic should eventually call + * ptlrpc_service_nrs_cleanup() to undo any work performed + * by this function. + * + * \see ptlrpc_register_service() + * \see ptlrpc_service_nrs_cleanup() + */ +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + int rc = 0; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Initialize NRS heads on all service CPTs. + */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + rc = nrs_svcpt_setup_locked(svcpt); + if (rc != 0) + goto failed; + } + + /** + * Set up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) + goto failed; + } + } + +failed: + + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} + +/** + * Unregisters all policies on all service partitions of service \a svc. + * + * \param[in] svc the PTLRPC service to unregister + */ +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Clean up NRS heads on all service partitions + */ + ptlrpc_service_for_each_part(svcpt, i, svc) + nrs_svcpt_cleanup_locked(svcpt); + + /** + * Clean up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. 
+ * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the request to be handled + * \retval NULL the head has no requests to serve + */ +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_request *nrq; + + /** + * Always try to drain requests from all NRS polices even if they are + * inactive, because the user can change policy status at runtime. + */ + list_for_each_entry(policy, &nrs->nrs_policy_queued, + pol_list_queued) { + nrq = nrs_request_get(policy, peek, force); + if (nrq != NULL) { + if (likely(!peek)) { + nrq->nr_started = 1; + + policy->pol_req_started++; + policy->pol_nrs->nrs_req_started++; + + nrs_request_removed(policy); + } + + return container_of(nrq, struct ptlrpc_request, rq_nrq); + } + } + + return NULL; +} + +/** + * Dequeues request \a req from the policy it has been enqueued on. + * + * \param[in] req the request + */ +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq); + + policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq); + + req->rq_nrq.nr_enqueued = 0; + + nrs_request_removed(policy); +} + +/** + * Returns whether there are any requests currently enqueued on any of the + * policies of service partition's \a svcpt NRS head specified by \a hp. Should + * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable + * result. + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return nrs->nrs_req_queued > 0; +}; + +/** + * Moves request \a req from the regular to the high-priority NRS head. + * + * \param[in] req the request to move + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + struct ptlrpc_nrs_resource *res1[NRS_RES_MAX]; + struct ptlrpc_nrs_resource *res2[NRS_RES_MAX]; + + /** + * Obtain the high-priority NRS head resources. + */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. 
+ * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + goto out; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + goto out; + } + } +out: + return rc; +} + + +/* ptlrpc/nrs_fifo.c */ +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. + * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + goto fail; + + + return rc; +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + return rc; +} + +/** + * Removes all policy descriptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. 
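+ *
+ * A brief sketch of the intended pairing with ptlrpc_nrs_init(); the
+ * surrounding module setup/teardown code is illustrative only and lives
+ * outside this file:
+ *
+ *	rc = ptlrpc_nrs_init();
+ *	if (rc != 0)
+ *		return rc;
+ *	...
+ *	ptlrpc_nrs_fini();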
+ */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 000000000..eb40c01db --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,270 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. 
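+ *
+ * By the time this is called the policy has no queued or started requests
+ * left (see nrs_policy_put_locked()), so the list embedded in the private
+ * data being freed here is expected to be empty, as asserted below.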
+ * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. + * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(list_empty(&head->fh_list)) ? NULL : + list_entry(head->fh_list.next, struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. 
+ * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c new file mode 100644 index 000000000..b51af9bf3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,2536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/obd_cksum.h" +#include "../include/lustre/ll_fiemap.h" + +static inline int lustre_msg_hdr_size_v2(int count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +int lustre_msg_hdr_size(__u32 magic, int count) +{ + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_hdr_size); + +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + lustre_set_req_swabbed(req, index); + else + lustre_set_rep_swabbed(req, index); +} +EXPORT_SYMBOL(ptlrpc_buf_set_swabbed); + +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + else + return (ptlrpc_rep_need_swab(req) && + !lustre_rep_swabbed(req, index)); +} +EXPORT_SYMBOL(ptlrpc_buf_need_swab); + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + __u32 version) +{ + __u32 ver = lustre_msg_get_version(msg); + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_check_version); + +/* early reply size */ +int lustre_msg_early_size(void) +{ + static int size; + if (!size) { + /* Always reply old ptlrpc_body_v2 to keep interoperability + * with the old client (< 2.3) which doesn't have pb_jobid + * in the ptlrpc_body. + * + * XXX Remove this whenever we drop interoperability with such + * client. + */ + __u32 pblen = sizeof(struct ptlrpc_body_v2); + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); + } + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + +int lustre_msg_size_v2(int count, __u32 *lengths) +{ + int size; + int i; + + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. 
b=14043 */ +int lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_size); + +/* This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. */ +int lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + LOGL(tmp, lens[i], ptr); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + return lustre_pack_request_v2(req, count, lens, bufs); +} +EXPORT_SYMBOL(lustre_pack_request); + +#if RS_DEBUG +LIST_HEAD(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + struct l_wait_info lwi; + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL); + rc = l_wait_event(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), &lwi); + if (rc != 0) + goto out; + 
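+	/*
+	 * Illustrative sketch of how the emergency reply-state pool above is
+	 * meant to be used (an assumption-laden example, not code from this
+	 * file; the error handling shown is hypothetical):
+	 *
+	 *	struct ptlrpc_reply_state *rs = lustre_get_emerg_rs(svcpt);
+	 *
+	 *	if (rs == NULL)
+	 *		goto failed;	// pool stayed empty past the timeout
+	 *	// rs is preallocated, zeroed to srv_max_reply_size and has
+	 *	// rs_prealloc set, so it is usable when allocation could fail
+	 *	lustre_put_emerg_rs(rs);	// returns it to scp_rep_idle
+	 */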
spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + + LASSERT(req->rq_reply_state == NULL); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + return rc; + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + return 0; +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} +EXPORT_SYMBOL(lustre_pack_reply_flags); + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) +{ + int i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(n >= 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small (required %d, opc=%d)\n", + m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} +EXPORT_SYMBOL(lustre_free_reply_state); + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i; + + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } + + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + required_len += cfs_size_round(m->lm_buflens[i]); + } + + if (len < required_len) { + CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + return -EINVAL; + } + + rc = lustre_unpack_msg_v2(m, len); + + return rc; +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_req_msg); + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_rep_msg); + +static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + const int inout, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m = inout ? 
req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (ptlrpc_buf_need_swab(req, inout, offset)) { + lustre_swab_ptlrpc_body(pb); + ptlrpc_buf_set_swabbed(req, inout, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (!inout) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 1, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 0, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +int lustre_msg_buflen(struct lustre_msg *m, int n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +EXPORT_SYMBOL(lustre_msg_set_buflen); + +/* NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
*/ +int lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_bufcount); + +char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + int slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n", + m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} +EXPORT_SYMBOL(lustre_msg_string); + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index, + int min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr && swabber) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_flags; + } + default: + /* flags might be printed in debug code while message + * uninitialized */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg 
*msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_flags); + +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_op_flags; + } + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_op_flags); + +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_handle); + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +__u32 lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_version); + +void lustre_msg_add_version(struct lustre_msg *msg, int version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: 
%08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_version); + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg); + LBUG(); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_status; + } + default: + /* status might be printed in debug code while message + * uninitialized */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_slv); + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_slv); + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch 
(msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_limit); + + +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_limit); + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +int lustre_msg_is_v1(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_is_v1); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_magic); + +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = + lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + __u32 crc; + unsigned int hsize = 4; + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, 
+ lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF), + NULL, 0, (unsigned char *)&crc, &hsize); + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_handle); + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_type); + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_opc); + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_committed); + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + 
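+		/*
+		 * A note on the accessor pattern used throughout this file
+		 * (a descriptive sketch, not additional logic): every
+		 * lustre_msg_get_*() / lustre_msg_set_*() helper switches on
+		 * lm_magic and locates the message body via
+		 *
+		 *	pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+		 *			       sizeof(struct ptlrpc_body_v2));
+		 *
+		 * before touching a single pb_* field.  Getters tolerate a
+		 * missing body (CERROR plus a benign default value), whereas
+		 * setters such as this one treat it as a bug (LASSERTF); the
+		 * default branch below rejects an unknown magic the same way.
+		 */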
LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_conn_cnt); + +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrunk. + * See the comment in ptlrpc_request_pack(). */ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_req_set_repsize); + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. 
+ */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + return rc; +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. + */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) +{ + __swab32s(&b->pb_type); + __swab32s(&b->pb_version); + __swab32s(&b->pb_opc); + __swab32s(&b->pb_status); + __swab64s(&b->pb_last_xid); + __swab64s(&b->pb_last_seen); + __swab64s(&b->pb_last_committed); + __swab64s(&b->pb_transno); + __swab32s(&b->pb_flags); + __swab32s(&b->pb_op_flags); + __swab32s(&b->pb_conn_cnt); + __swab32s(&b->pb_timeout); + __swab32s(&b->pb_service_time); + __swab32s(&b->pb_limit); + __swab64s(&b->pb_slv); + __swab64s(&b->pb_pre_versions[0]); + __swab64s(&b->pb_pre_versions[1]); + __swab64s(&b->pb_pre_versions[2]); + __swab64s(&b->pb_pre_versions[3]); + CLASSERT(offsetof(typeof(*b), pb_padding) != 0); + /* While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. */ + CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); +} +EXPORT_SYMBOL(lustre_swab_ptlrpc_body); + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values */ + __swab16s(&ocd->ocd_grant_extent); + __swab32s(&ocd->ocd_unused); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. 
*/ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + CLASSERT(offsetof(typeof(*ocd), padding1) != 0); + CLASSERT(offsetof(typeof(*ocd), padding2) != 0); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); + CLASSERT(offsetof(typeof(*ocd), padding7) != 0); + CLASSERT(offsetof(typeof(*ocd), padding8) != 0); + CLASSERT(offsetof(typeof(*ocd), padding9) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingA) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingB) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingC) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingD) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingE) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingF) != 0); +} + +void lustre_swab_obdo(struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + /* o_handle is opaque */ + /* o_lcookie is swabbed elsewhere */ + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + CLASSERT(offsetof(typeof(*o), o_padding_4) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_5) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_6) != 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs(struct obd_statfs *os) +{ + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); + CLASSERT(offsetof(typeof(*os), os_spare2) != 0); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); +} +EXPORT_SYMBOL(lustre_swab_obd_statfs); + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(lustre_swab_obd_ioobj); + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->offset); + __swab32s(&nbr->len); + __swab32s(&nbr->flags); +} +EXPORT_SYMBOL(lustre_swab_niobuf_remote); + +void lustre_swab_ost_body(struct ost_body *b) +{ + lustre_swab_obdo(&b->oa); +} +EXPORT_SYMBOL(lustre_swab_ost_body); + +void lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} +EXPORT_SYMBOL(lustre_swab_ost_last_id); + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} +EXPORT_SYMBOL(lustre_swab_generic_32s); + +void lustre_swab_gl_desc(union ldlm_gl_desc *desc) +{ + 
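+	/*
+	 * A sketch of the swabbing convention followed by every
+	 * lustre_swab_*() helper here (descriptive only; gl_example below is
+	 * a hypothetical field, not part of ldlm_gl_desc): when the peer's
+	 * endianness differs (detected via the *_SWABBED magic), each
+	 * multi-byte field is byte-swapped in declaration order with
+	 * __swab16s/__swab32s/__swab64s, opaque byte arrays (handles, uuids)
+	 * are left alone, and CLASSERT(offsetof(...) != 0) pins the padding
+	 * layout at compile time.  Adding a field would add one line, e.g.
+	 *
+	 *	__swab64s(&desc->lquota_desc.gl_example);
+	 */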
lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid); + __swab64s(&desc->lquota_desc.gl_flags); + __swab64s(&desc->lquota_desc.gl_ver); + __swab64s(&desc->lquota_desc.gl_hardlimit); + __swab64s(&desc->lquota_desc.gl_softlimit); + __swab64s(&desc->lquota_desc.gl_time); + CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0); +} + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_mdt_body(struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->fid1); + lustre_swab_lu_fid(&b->fid2); + /* handle is opaque */ + __swab64s(&b->valid); + __swab64s(&b->size); + __swab64s(&b->mtime); + __swab64s(&b->atime); + __swab64s(&b->ctime); + __swab64s(&b->blocks); + __swab64s(&b->ioepoch); + __swab64s(&b->t_state); + __swab32s(&b->fsuid); + __swab32s(&b->fsgid); + __swab32s(&b->capability); + __swab32s(&b->mode); + __swab32s(&b->uid); + __swab32s(&b->gid); + __swab32s(&b->flags); + __swab32s(&b->rdev); + __swab32s(&b->nlink); + CLASSERT(offsetof(typeof(*b), unused2) != 0); + __swab32s(&b->suppgid); + __swab32s(&b->eadatasize); + __swab32s(&b->aclsize); + __swab32s(&b->max_mdsize); + __swab32s(&b->max_cookiesize); + __swab32s(&b->uid_h); + __swab32s(&b->gid_h); + CLASSERT(offsetof(typeof(*b), padding_5) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_body); + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* handle is opaque */ + __swab64s(&b->ioepoch); + __swab32s(&b->flags); + CLASSERT(offsetof(typeof(*b), padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_ioepoch); + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} +EXPORT_SYMBOL(lustre_swab_mgs_target_info); + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + int i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. 
*/ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_body); + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_res); + +static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i) +{ + __swab64s(&i->dqi_bgrace); + __swab64s(&i->dqi_igrace); + __swab32s(&i->dqi_flags); + __swab32s(&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk(struct obd_dqblk *b) +{ + __swab64s(&b->dqb_ihardlimit); + __swab64s(&b->dqb_isoftlimit); + __swab64s(&b->dqb_curinodes); + __swab64s(&b->dqb_bhardlimit); + __swab64s(&b->dqb_bsoftlimit); + __swab64s(&b->dqb_curspace); + __swab64s(&b->dqb_btime); + __swab64s(&b->dqb_itime); + __swab32s(&b->dqb_valid); + CLASSERT(offsetof(typeof(*b), dqb_padding) != 0); +} + +void lustre_swab_obd_quotactl(struct obd_quotactl *q) +{ + __swab32s(&q->qc_cmd); + __swab32s(&q->qc_type); + __swab32s(&q->qc_id); + __swab32s(&q->qc_stat); + lustre_swab_obd_dqinfo(&q->qc_dqinfo); + lustre_swab_obd_dqblk(&q->qc_dqblk); +} +EXPORT_SYMBOL(lustre_swab_obd_quotactl); + +void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p) +{ + __swab32s(&p->rp_uid); + __swab32s(&p->rp_gid); + __swab32s(&p->rp_fsuid); + __swab32s(&p->rp_fsuid_h); + __swab32s(&p->rp_fsgid); + __swab32s(&p->rp_fsgid_h); + __swab32s(&p->rp_access_perm); + __swab32s(&p->rp_padding); +}; +EXPORT_SYMBOL(lustre_swab_mdt_remote_perm); + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +void lustre_swab_fiemap(struct ll_user_fiemap *fiemap) +{ + int i; + + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); +} +EXPORT_SYMBOL(lustre_swab_fiemap); + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* 
rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + + CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); +}; +EXPORT_SYMBOL(lustre_swab_mdt_rec_reint); + +void lustre_swab_lov_desc(struct lov_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_stripe_size); + __swab64s(&ld->ld_default_stripe_offset); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc(struct lmv_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_hash_size); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea) +{ + __swab32s(&mea->mea_magic); + __swab32s(&mea->mea_count); + __swab32s(&mea->mea_master); + CLASSERT(offsetof(typeof(*mea), mea_padding) != 0); +} + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + int i; + + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0); + + for (i = 0; i < lum->lum_stripe_count; i++) { + __swab32s(&lum->lum_objects[i].lum_mds); + lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid); + } + +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void print_lum(struct lov_user_md *lum) +{ + CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); + CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(D_OTHER, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); +} + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + print_lum(lum); +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void 
lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s(&id->name[i]); +} +EXPORT_SYMBOL(lustre_swab_ldlm_res_id); + +void lustre_swab_ldlm_policy_data(ldlm_wire_policy_data_t *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} +EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); + +void lustre_swab_ldlm_intent(struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} +EXPORT_SYMBOL(lustre_swab_ldlm_intent); + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_padding) != 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} +EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); + +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc(&l->l_resource); + __swab32s(&l->l_req_mode); + __swab32s(&l->l_granted_mode); + lustre_swab_ldlm_policy_data(&l->l_policy_data); +} +EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); + +void lustre_swab_ldlm_request(struct ldlm_request *rq) +{ + __swab32s(&rq->lock_flags); + lustre_swab_ldlm_lock_desc(&rq->lock_desc); + __swab32s(&rq->lock_count); + /* lock_handle[] opaque */ +} +EXPORT_SYMBOL(lustre_swab_ldlm_request); + +void lustre_swab_ldlm_reply(struct ldlm_reply *r) +{ + __swab32s(&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) != 0); + lustre_swab_ldlm_lock_desc(&r->lock_desc); + /* lock_handle opaque */ + __swab64s(&r->lock_policy_res1); + __swab64s(&r->lock_policy_res2); +} +EXPORT_SYMBOL(lustre_swab_ldlm_reply); + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid=" DOSTID ", ioo_max_brw=%#x, ioo_bufct=%d\n", + POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(dump_ioo); + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->offset, nb->len, nb->flags); +} +EXPORT_SYMBOL(dump_rniobuf); + +void dump_obdo(struct obdo *oa) +{ + __u32 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = 
%lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLGENER) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLEPOCH) + CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", + oa->o_ioepoch); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); + if (valid & OBD_MD_FLCOOKIE) + CDEBUG(D_RPCTRACE, "obdo: o_lcookie = (llog_cookie dumping not yet implemented)\n"); +} +EXPORT_SYMBOL(dump_obdo); + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} +EXPORT_SYMBOL(dump_ost_body); + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} +EXPORT_SYMBOL(dump_rcs); + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repmsg); + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) 
+{ + int req_ok = req->rq_reqmsg != NULL; + int rep_ok = req->rq_repmsg != NULL; + lnet_nid_t nid = LNET_NID_ANY; + va_list args; + + if (ptlrpc_req_need_swab(req)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (req->rq_import && req->rq_import->imp_connection) + nid = req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %d dl " CFS_TIME_T " ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n", + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + libcfs_nid2str(nid), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, req->rq_timedout, + req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1, + req->rq_status, + rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1); + va_end(args); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_lustre_capa(struct lustre_capa *c) +{ + lustre_swab_lu_fid(&c->lc_fid); + __swab64s(&c->lc_opc); + __swab64s(&c->lc_uid); + __swab64s(&c->lc_gid); + __swab32s(&c->lc_flags); + __swab32s(&c->lc_keyid); + __swab32s(&c->lc_timeout); + __swab32s(&c->lc_expiry); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa); + +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k) +{ + __swab64s(&k->lk_seq); + __swab32s(&k->lk_keyid); + CLASSERT(offsetof(typeof(*k), lk_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa_key); + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_state); + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_state_set); + +void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} +EXPORT_SYMBOL(lustre_swab_hsm_current_action); + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_item); + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + __swab64s(&li->li_start); + __swab64s(&li->li_end); +} +EXPORT_SYMBOL(lustre_swab_layout_intent); + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} +EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel); + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + 
__swab32s(&hr->hr_action); + __swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} +EXPORT_SYMBOL(lustre_swab_hsm_request); + +void lustre_swab_update_buf(struct update_buf *ub) +{ + __swab32s(&ub->ub_magic); + __swab32s(&ub->ub_count); +} +EXPORT_SYMBOL(lustre_swab_update_buf); + +void lustre_swab_update_reply_buf(struct update_reply *ur) +{ + int i; + + __swab32s(&ur->ur_version); + __swab32s(&ur->ur_count); + for (i = 0; i < ur->ur_count; i++) + __swab32s(&ur->ur_lens[i]); +} +EXPORT_SYMBOL(lustre_swab_update_reply_buf); + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} +EXPORT_SYMBOL(lustre_swab_swap_layouts); + +void lustre_swab_close_data(struct close_data *cd) +{ + lustre_swab_lu_fid(&cd->cd_fid); + __swab64s(&cd->cd_data_version); +} +EXPORT_SYMBOL(lustre_swab_close_data); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c new file mode 100644 index 000000000..e1334c24e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
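The swab helpers above all follow the same in-place byte-reversal pattern for wire structures: every multi-byte integer field is reversed, while opaque byte arrays (UUIDs, pool names) are left untouched. A minimal standalone sketch of that pattern, with hypothetical names (swab32, struct wire_rec) standing in for the kernel's __swab32s and the Lustre wire structs:

#include <stdio.h>
#include <stdint.h>

/* Illustrative sketch only: in-place byte swapping of a wire structure
 * received from a peer of the opposite endianness. swab32 and
 * struct wire_rec are hypothetical stand-ins, not part of the patch. */
static void swab32(uint32_t *v)
{
	*v = ((*v & 0x000000ffu) << 24) | ((*v & 0x0000ff00u) << 8) |
	     ((*v & 0x00ff0000u) >> 8)  | ((*v & 0xff000000u) >> 24);
}

struct wire_rec {
	uint32_t r_magic;
	uint32_t r_flags;
	char     r_name[16];	/* endian-insensitive: not swabbed */
};

static void swab_wire_rec(struct wire_rec *r)
{
	swab32(&r->r_magic);
	swab32(&r->r_flags);
}

int main(void)
{
	struct wire_rec r = { 0x12345678u, 0x1u, "demo" };

	swab_wire_rec(&r);
	printf("magic %#x flags %#x\n", (unsigned)r.r_magic,
	       (unsigned)r.r_flags);	/* magic 0x78563412 flags 0x1000000 */
	return 0;
}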
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_import.h" + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | + LNET_MD_PHYS))); + + md->options |= LNET_MD_KIOV; + md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); + md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + if (desc->bd_enc_iov) + md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV]; + else + md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV]; +} + +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len) +{ + lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count]; + + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; + + desc->bd_iov_count++; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c new file mode 100644 index 000000000..9dbda9332 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c @@ -0,0 +1,678 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. 
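ptlrpc_fill_bulk_md above clamps each memory descriptor to at most LNET_MAX_IOV pages of the bulk descriptor: the remaining page count past mdidx * LNET_MAX_IOV, never negative and never above the per-MD limit. A minimal sketch of that chunking arithmetic, assuming a hypothetical CHUNK_PAGES limit in place of LNET_MAX_IOV:

#include <stdio.h>

/* Illustrative sketch only: splitting a page count across fixed-size
 * chunks. CHUNK_PAGES and chunk_len are hypothetical names. */
#define CHUNK_PAGES 256		/* assumed per-chunk page limit */

static int chunk_len(int total_pages, int chunk_idx)
{
	int remaining = total_pages - chunk_idx * CHUNK_PAGES;

	if (remaining < 0)
		remaining = 0;	/* past the end: empty chunk */
	return remaining < CHUNK_PAGES ? remaining : CHUNK_PAGES;
}

int main(void)
{
	/* 600 pages split into chunks of 256 -> 256, 256, 88, 0 */
	for (int i = 0; i < 4; i++)
		printf("chunk %d: %d pages\n", i, chunk_len(600, i));
	return 0;
}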
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + +static int suppress_pings; +module_param(suppress_pings, int, 0644); +MODULE_PARM_DESC(suppress_pings, "Suppress pings"); + +struct mutex pinger_mutex; +static LIST_HEAD(pinger_imports); +static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list); + +int ptlrpc_pinger_suppress_pings(void) +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + if (req == NULL) + return -ENOMEM; + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_prep_ping(imp); + if (req == NULL) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + return -ENOMEM; + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + return 0; +} + +void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ + int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = cfs_time_current(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return (imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); +} + +static inline int ptlrpc_next_reconnect(struct obd_import *imp) +{ + if (imp->imp_server_timeout) + return cfs_time_shift(obd_timeout / 2); + else + return cfs_time_shift(obd_timeout); +} + +long pinger_check_timeout(unsigned long time) +{ + struct timeout_item *item; + unsigned long timeout = PING_INTERVAL; + + /* The timeout list is a increase order sorted list */ + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + int ti_timeout = item->ti_timeout; + if (timeout > ti_timeout) + timeout = ti_timeout; + break; + } + mutex_unlock(&pinger_mutex); + + return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), + cfs_time_current()); +} + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + unsigned long this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". 
+ */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + spin_unlock(&imp->imp_lock); + + CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + if (!imp->imp_no_pinger_recover) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, "%s->%s: not pinging (in recovery or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) { + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + ptlrpc_ping(imp); + } +} + +static int ptlrpc_pinger_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. */ + while (1) { + unsigned long this_ping = cfs_time_current(); + struct l_wait_info lwi; + long time_to_next_wake; + struct timeout_item *item; + struct list_head *iter; + + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + item->ti_cb(item, item->ti_cb_data); + } + list_for_each(iter, &pinger_imports) { + struct obd_import *imp = + list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + cfs_time_after(imp->imp_next_ping, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL)))) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + /* update memory usage info */ + obd_update_maxusage(); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. 
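pinger_check_timeout above derives the sleep time from the start of the current ping cycle and the smallest registered timeout, and the main loop clamps the wait to at least one second so the thread keeps making progress. A minimal sketch of that computation, using hypothetical names and plain time_t instead of the cfs_time_* wrappers:

#include <stdio.h>
#include <time.h>

/* Illustrative sketch only: sleep until the earliest deadline, i.e.
 * (cycle_start + smallest timeout) - now, clamped to a 1s minimum.
 * All names here are hypothetical. */
static long seconds_to_next_wake(time_t cycle_start, long min_timeout,
				 time_t now)
{
	long wait = (long)(cycle_start + min_timeout - now);

	return wait > 1 ? wait : 1;	/* clamp to a 1s minimum */
}

int main(void)
{
	time_t start = time(NULL);

	/* e.g. a 25s ping interval, checked 3s into the cycle -> 22s */
	printf("sleep %lds\n", seconds_to_next_wake(start, 25, start + 3));
	return 0;
}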
*/ + CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (" + CFS_TIME_T")\n", time_to_next_wake, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL))); + if (time_to_next_wake > 0) { + lwi = LWI_TIMEOUT(max_t(long, time_to_next_wake, + cfs_time_seconds(1)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { + break; + } else { + /* woken after adding import to reset timer */ + thread_test_and_clear_flags(thread, SVC_EVENT); + } + } + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); + return 0; +} + +static struct ptlrpc_thread pinger_thread; + +int ptlrpc_start_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc; + + if (!thread_is_init(&pinger_thread) && + !thread_is_stopped(&pinger_thread)) + return -EALREADY; + + init_waitqueue_head(&pinger_thread.t_ctl_waitq); + + strcpy(pinger_thread.t_name, "ll_ping"); + + /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we + * just drop the VM and FILES in cfs_daemonize_ctxt() right away. */ + rc = PTR_ERR(kthread_run(ptlrpc_pinger_main, &pinger_thread, + "%s", pinger_thread.t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread: %d\n", rc); + return rc; + } + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_running(&pinger_thread), &lwi); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. (Search for the \"suppress_pings\" kernel module parameter.)\n"); + + return 0; +} + +int ptlrpc_pinger_remove_timeouts(void); + +int ptlrpc_stop_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc = 0; + + if (thread_is_init(&pinger_thread) || + thread_is_stopped(&pinger_thread)) + return -EALREADY; + + ptlrpc_pinger_remove_timeouts(); + thread_set_flags(&pinger_thread, SVC_STOPPING); + wake_up(&pinger_thread.t_ctl_waitq); + + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_stopped(&pinger_thread), &lwi); + + return rc; +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} +EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import); + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + assert_spin_locked(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. 
+ */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + if (!list_empty(&imp->imp_pinger_chain)) + return -EALREADY; + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + if (list_empty(&imp->imp_pinger_chain)) + return -ENOENT; + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +/** + * Register a timeout callback to the pinger list, and the callback will + * be called when timeout happens. + */ +struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *ti; + + OBD_ALLOC_PTR(ti); + if (!ti) + return NULL; + + INIT_LIST_HEAD(&ti->ti_obd_list); + INIT_LIST_HEAD(&ti->ti_chain); + ti->ti_timeout = time; + ti->ti_event = event; + ti->ti_cb = cb; + ti->ti_cb_data = data; + + return ti; +} + +/** + * Register timeout event on the pinger thread. + * Note: the timeout list is an sorted list with increased timeout value. + */ +static struct timeout_item* +ptlrpc_pinger_register_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *item, *tmp; + + LASSERT(mutex_is_locked(&pinger_mutex)); + + list_for_each_entry(item, &timeout_list, ti_chain) + if (item->ti_event == event) + goto out; + + item = ptlrpc_new_timeout(time, event, cb, data); + if (item) { + list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { + if (tmp->ti_timeout < time) { + list_add(&item->ti_chain, &tmp->ti_chain); + goto out; + } + } + list_add(&item->ti_chain, &timeout_list); + } +out: + return item; +} + +/* Add a client_obd to the timeout event list, when timeout(@time) + * happens, the callback(@cb) will be called. 
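ptlrpc_pinger_register_timeout above keeps timeout_list sorted by increasing timeout, so the earliest item is always at the head when pinger_check_timeout scans it. A minimal sketch of that sorted-insert invariant on a plain singly linked list, with hypothetical names rather than the kernel list_head API:

#include <stdio.h>

/* Illustrative sketch only: insert while preserving ascending order. */
struct timeout_node {
	int timeout;
	struct timeout_node *next;
};

static void insert_sorted(struct timeout_node **head, struct timeout_node *n)
{
	struct timeout_node **p = head;

	/* walk until the next node's timeout is not smaller */
	while (*p && (*p)->timeout < n->timeout)
		p = &(*p)->next;
	n->next = *p;
	*p = n;
}

int main(void)
{
	struct timeout_node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
	struct timeout_node *head = NULL;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (struct timeout_node *n = head; n; n = n->next)
		printf("%d ", n->timeout);	/* prints: 10 20 30 */
	printf("\n");
	return 0;
}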
+ */ +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list) +{ + struct timeout_item *ti; + + mutex_lock(&pinger_mutex); + ti = ptlrpc_pinger_register_timeout(time, event, cb, data); + if (!ti) { + mutex_unlock(&pinger_mutex); + return -EINVAL; + } + list_add(obd_list, &ti->ti_obd_list); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_add_timeout_client); + +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event) +{ + struct timeout_item *ti = NULL, *item; + + if (list_empty(obd_list)) + return 0; + mutex_lock(&pinger_mutex); + list_del_init(obd_list); + /** + * If there are no obd attached to the timeout event + * list, remove this timeout event from the pinger + */ + list_for_each_entry(item, &timeout_list, ti_chain) { + if (item->ti_event == event) { + ti = item; + break; + } + } + LASSERTF(ti != NULL, "ti is NULL !\n"); + if (list_empty(&ti->ti_obd_list)) { + list_del(&ti->ti_chain); + OBD_FREE_PTR(ti); + } + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_del_timeout_client); + +int ptlrpc_pinger_remove_timeouts(void) +{ + struct timeout_item *item, *tmp; + + mutex_lock(&pinger_mutex); + list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { + LASSERT(list_empty(&item->ti_obd_list)); + list_del(&item->ti_chain); + OBD_FREE_PTR(item); + } + mutex_unlock(&pinger_mutex); + return 0; +} + +void ptlrpc_pinger_wake_up(void) +{ + thread_add_flags(&pinger_thread, SVC_EVENT); + wake_up(&pinger_thread.t_ctl_waitq); +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount; +static int pet_state; +static wait_queue_head_t pet_waitq; +LIST_HEAD(pet_list); +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + + unshare_fs_struct(); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE), &lwi); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = get_seconds() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. 
*/ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %ld seconds. I think it's dead, and I am evicting it. exp %p, cur %ld expire %ld last %ld\n", + obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + (long)(get_seconds() - + exp->exp_last_request_time), + exp, (long)get_seconds(), + (long)expire_time, + (long)exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %ld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + return 0; +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 000000000..a66dc3c6d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,312 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* Intramodule declarations for ptlrpc. 
*/ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct mutex ptlrpc_all_services_mutex; + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); +/* ptlrpcd.c */ +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc); + +/* client.c */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +int ptlrpc_request_cache_init(void); +void ptlrpc_request_cache_fini(void); +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); +void ptlrpc_request_cache_free(struct ptlrpc_request *req); +void ptlrpc_init_xid(void); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +#if defined(CONFIG_PROC_FS) +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, + struct ptlrpc_service *svc); +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat(struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_register_service(params...) do {} while (0) +#define ptlrpc_lprocfs_unregister_service(params...) do {} while (0) +#define ptlrpc_lprocfs_rpc_sent(params...) do {} while (0) +#define ptlrpc_lprocfs_do_request_stat(params...) do {} while (0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /* XXX: This is just for liblustre. Remove the #if defined directive + * when the * "cfs_" prefix is dropped from cfs_list_head. */ + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. 
+ */ + struct list_head nrs_policies; + +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. + */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. 
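The LPROCFS_NRS_WR_QUANTUM_MAX_CMD macro defined just below sizes the command buffer at compile time by taking sizeof of concatenated string literals built from the worst-case quantum values. A minimal standalone sketch of that idiom, with STR()/XSTR() standing in for __stringify() and all other names hypothetical:

#include <stdio.h>

/* Illustrative sketch only: compile-time buffer sizing from the
 * worst-case command text. sizeof on a string literal includes the
 * trailing NUL, so WR_CMD_MAX is the longest command plus one. */
#define XSTR(x)	#x
#define STR(x)	XSTR(x)

#define QUANTUM_MAX	65535
#define NAME_REG	"reg_quantum:"
#define NAME_HP		"hp_quantum:"

#define WR_CMD_MAX \
	sizeof(NAME_REG STR(QUANTUM_MAX) " " NAME_HP STR(QUANTUM_MAX))

int main(void)
{
	char buf[WR_CMD_MAX];

	snprintf(buf, sizeof(buf), "%s%d %s%d",
		 NAME_REG, QUANTUM_MAX, NAME_HP, QUANTUM_MAX);
	printf("max cmd %zu bytes: \"%s\"\n", (size_t)WR_CMD_MAX, buf);
	return 0;
}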
+ */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +#if defined(CONFIG_PROC_FS) +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); +#else +static inline int sptlrpc_lproc_init(void) +{ return 0; } +static inline void sptlrpc_lproc_fini(void) {} +#endif + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +static inline int ll_rpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} +#endif /* PTLRPC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 000000000..5268887ca --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_req_layout.h" + +#include "ptlrpc_internal.h" + +extern spinlock_t ptlrpc_last_xid_lock; +#if RS_DEBUG +extern spinlock_t ptlrpc_rs_debug_lock; +#endif +extern struct mutex pinger_mutex; +extern struct mutex ptlrpcd_mutex; + +__init int ptlrpc_init(void) +{ + int rc, cleanup_phase = 0; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + + rc = req_layout_init(); + if (rc) + return rc; + + rc = ptlrpc_hr_init(); + if (rc) + return rc; + + cleanup_phase = 1; + rc = ptlrpc_request_cache_init(); + if (rc) + goto cleanup; + + cleanup_phase = 2; + rc = ptlrpc_init_portals(); + if (rc) + goto cleanup; + + cleanup_phase = 3; + + rc = ptlrpc_connection_init(); + if (rc) + goto cleanup; + + cleanup_phase = 4; + ptlrpc_put_connection_superhack = ptlrpc_connection_put; + + rc = ptlrpc_start_pinger(); + if (rc) + goto cleanup; + + cleanup_phase = 5; + rc = ldlm_init(); + if (rc) + goto cleanup; + + cleanup_phase = 6; + rc = sptlrpc_init(); + if (rc) + goto cleanup; + + cleanup_phase = 7; + rc = ptlrpc_nrs_init(); + if (rc) + goto cleanup; + + cleanup_phase = 8; + rc = tgt_mod_init(); + if (rc) + goto cleanup; + return 0; + +cleanup: + switch (cleanup_phase) { + case 8: + ptlrpc_nrs_fini(); + /* Fall through */ + case 7: + sptlrpc_fini(); + /* Fall through */ + case 6: + ldlm_exit(); + /* Fall through */ + case 5: + ptlrpc_stop_pinger(); + /* Fall through */ + case 4: + ptlrpc_connection_fini(); + /* Fall through */ + case 3: + ptlrpc_exit_portals(); + /* Fall through */ + case 2: + ptlrpc_request_cache_fini(); + /* Fall through */ + case 1: + ptlrpc_hr_fini(); + req_layout_fini(); + /* Fall through */ + default: ; + } + + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + tgt_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 000000000..0c178ec0e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,811 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. 
+ * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/lustre_net.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/obd_class.h" /* for obd_zombie */ +#include "../include/obd_support.h" /* for OBD_FAIL_CHECK */ +#include "../include/cl_object.h" /* cl_env_{get,put}() */ +#include "../include/lprocfs_status.h" + +#include "ptlrpc_internal.h" + +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_nthreads; + struct ptlrpcd_ctl pd_thread_rcv; + struct ptlrpcd_ctl pd_threads[0]; +}; + +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, "Max ptlrpcd thread count to be started."); + +static int ptlrpcd_bind_policy = PDB_POLICY_PAIR; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, "Ptlrpcd threads binding mode."); +static struct ptlrpcd *ptlrpcds; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *rq_set = req->rq_set; + + LASSERT(rq_set != NULL); + + wake_up(&rq_set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index) +{ + int idx = 0; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcds->pd_thread_rcv; + + switch (policy) { + case PDL_POLICY_SAME: + idx = smp_processor_id() % ptlrpcds->pd_nthreads; + break; + case PDL_POLICY_LOCAL: + /* Before CPU partition patches available, process it the same + * as "PDL_POLICY_ROUND". */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix this code to use new CPU partition APIs" +# endif + /* Fall through to PDL_POLICY_ROUND until the CPU + * CPU partition patches are available. */ + index = -1; + case PDL_POLICY_PREFERRED: + if (index >= 0 && index < num_online_cpus()) { + idx = index % ptlrpcds->pd_nthreads; + break; + } + /* Fall through to PDL_POLICY_ROUND for bad index. */ + default: + /* Fall through to PDL_POLICY_ROUND for unknown policy. */ + case PDL_POLICY_ROUND: + /* We do not care whether it is strict load balance. */ + idx = ptlrpcds->pd_index + 1; + if (idx == smp_processor_id()) + idx++; + idx %= ptlrpcds->pd_nthreads; + ptlrpcds->pd_index = idx; + break; + } + + return &ptlrpcds->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = cfs_time_current(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. 
*/ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpcd_add_rqset); + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_safe(pos, tmp, &src->set_new_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + req->rq_set = des; + } + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). + */ +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5), + back_to_sleep, NULL); + + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi); + } else if (req->rq_set) { + /* If we have a valid "rq_set", just reuse it to avoid double + * linked. */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req, policy, idx); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* We should call lu_env_refill() before handling new requests to make + * sure that env key the requests depending on really exists. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. 
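ptlrpcd_add_rqset above wakes the target set only when atomic_add_return shows the new-request counter held nothing before the splice, so an already-busy thread is not woken needlessly. A minimal sketch of that empty-to-non-empty wakeup idiom using C11 atomics and hypothetical names:

#include <stdio.h>
#include <stdatomic.h>

/* Illustrative sketch only: wake the consumer only when the queue
 * transitions from empty to non-empty. atomic_fetch_add returns the
 * previous count, so the producer can tell whether a worker could
 * already be draining the queue. */
static atomic_int pending = 0;

static void enqueue_batch(int n)
{
	int before = atomic_fetch_add(&pending, n);

	if (before == 0)
		printf("queue was empty: wake the worker\n");
	else
		printf("worker already has %d items: no wakeup needed\n",
		       before);
}

int main(void)
{
	enqueue_batch(3);	/* first batch: wake */
	enqueue_batch(2);	/* queue not empty: skip wakeup */
	return 0;
}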
+ */ + CERROR("Failure to refill session: %d\n", rc2); + return rc; + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + /* NB: ptlrpc_check_set has already moved completed request at the + * head of seq::set_requests */ + list_for_each_safe(pos, tmp, &set->set_requests) { + req = list_entry(pos, struct ptlrpc_request, rq_set_chain); + if (req->rq_phase != RQ_PHASE_COMPLETE) + break; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* If we have nothing to do, check whether we can take some + * work from our partner threads. */ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, "transfer %d async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + return rc; +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + * + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set = pc->pc_set; + struct lu_env env = { .le_ses = NULL }; + int rc, exit = 0; + + unshare_fs_struct(); +#if defined(CONFIG_SMP) + if (test_bit(LIOD_BIND, &pc->pc_flags)) { + int index = pc->pc_index; + + if (index >= 0 && index < num_possible_cpus()) { + while (!cpu_online(index)) { + if (++index >= num_possible_cpus()) + index = 0; + } + set_cpus_allowed_ptr(current, + cpumask_of_node(cpu_to_node(index))); + } + } +#endif + /* + * XXX So far only "client" ptlrpcd uses an environment. In + * the future, ptlrpcd thread (or a thread-set) has to given + * an argument, describing its "scope". + */ + rc = lu_context_init(&env.le_ctx, + LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF); + complete(&pc->pc_starting); + + if (rc != 0) + return rc; + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + struct l_wait_info lwi; + int timeout; + + timeout = ptlrpc_set_next_timeout(set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, set); + + lu_context_enter(&env.le_ctx); + l_wait_event(set->set_waitq, + ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(set); + lu_context_fini(&env.le_ctx); + + complete(&pc->pc_finishing); + + return 0; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we start many + * ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused by + * data transfer cross-CPU cores. So we bind ptlrpcd thread to specified + * CPU core. But binding all ptlrpcd threads maybe cause response delay + * because of some CPU core(s) busy with other loads. + * + * For example: "ls -l", some async RPCs for statahead are assigned to + * ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy + * with other non-ptlrpcd, like "ls -l" itself (we want to the "ls -l" + * thread, statahead thread, and ptlrpcd thread can run in parallel), under + * such case, the statahead async RPCs can not be processed in time, it is + * unexpected. If ptlrpcd_0 can be re-scheduled on other CPU core, it may + * be better. But it breaks former data transfer policy. + * + * So we shouldn't be blind for avoiding the data transfer. We make some + * compromise: divide the ptlrpcd threads pool into two parts. One part is + * for bound mode, each ptlrpcd thread in this part is bound to some CPU + * core. The other part is for free mode, all the ptlrpcd threads in the + * part can be scheduled on any CPU core. We specify some partnership + * between bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s), + * and the async RPC load within the partners are shared. + * + * It can partly avoid data transfer cross-CPU (if the bound mode ptlrpcd + * thread can be scheduled in time), and try to guarantee the async RPC + * processed ASAP (as long as the free mode ptlrpcd thread can be scheduled + * on any CPU core). + * + * As for how to specify the partnership between bound mode ptlrpcd + * thread(s) and free mode ptlrpcd thread(s), the simplest way is to use + * pair. In future, we can specify some more complex + * partnership based on the patches for CPU partition. But before such + * patches are available, we prefer to use the simplest one. 
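Under PDB_POLICY_PAIR described above, ptlrpcd_bind() marks the odd-indexed thread of each pair as CPU-bound and partners it with the preceding even-indexed, unbound thread, so each bound/free pair shares its async RPC load. A minimal sketch of that pairing rule, with a hypothetical pair_partner() helper:

#include <stdio.h>

/* Illustrative sketch only: pair-style partnership. Odd index -> bound,
 * partnered with index - 1; even index -> free, partnered with index + 1. */
static int pair_partner(int index)
{
	return (index & 0x1) ? index - 1 : index + 1;
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("thread %d: %s, partner %d\n", i,
		       (i & 0x1) ? "bound" : "free", pair_partner(i));
	return 0;
}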
+ */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix ptlrpcd_bind() to use new CPU partition APIs" +# endif +static int ptlrpcd_bind(int index, int max) +{ + struct ptlrpcd_ctl *pc; + int rc = 0; +#if defined(CONFIG_NUMA) + cpumask_t mask; +#endif + + LASSERT(index <= max - 1); + pc = &ptlrpcds->pd_threads[index]; + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_NONE: + pc->pc_npartners = -1; + break; + case PDB_POLICY_FULL: + pc->pc_npartners = 0; + set_bit(LIOD_BIND, &pc->pc_flags); + break; + case PDB_POLICY_PAIR: + LASSERT(max % 2 == 0); + pc->pc_npartners = 1; + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + int i; + cpumask_copy(&mask, cpumask_of_node(cpu_to_node(index))); + for (i = max; i < num_online_cpus(); i++) + cpumask_clear_cpu(i, &mask); + pc->pc_npartners = cpumask_weight(&mask) - 1; + set_bit(LIOD_BIND, &pc->pc_flags); + } +#else + LASSERT(max >= 3); + pc->pc_npartners = 2; +#endif + break; + default: + CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy); + rc = -EINVAL; + } + + if (rc == 0 && pc->pc_npartners > 0) { + OBD_ALLOC(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + rc = -ENOMEM; + } else { + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_PAIR: + if (index & 0x1) { + set_bit(LIOD_BIND, &pc->pc_flags); + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[0] = pc; + } + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + struct ptlrpcd_ctl *ppc; + int i, pidx; + /* partners are cores in the same NUMA node. + * setup partnership only with ptlrpcd threads + * that are already initialized + */ + for (pidx = 0, i = 0; i < index; i++) { + if (cpumask_test_cpu(i, &mask)) { + ppc = &ptlrpcds->pd_threads[i]; + pc->pc_partners[pidx++] = ppc; + ppc->pc_partners[ppc-> + pc_npartners++] = pc; + } + } + /* adjust number of partners to the number + * of partnership really setup */ + pc->pc_npartners = pidx; + } +#else + if (index & 0x1) + set_bit(LIOD_BIND, &pc->pc_flags); + if (index > 0) { + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[1] = pc; + if (index == max - 1) { + pc->pc_partners[1] = + &ptlrpcds->pd_threads[0]; + ptlrpcds->pd_threads[0]. + pc_partners[0] = pc; + } + } +#endif + break; + } + } + } + + return rc; +} + + +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc) +{ + int rc; + + /* + * Do not allow start second thread for one pc. + */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + name, pc); + return 0; + } + + pc->pc_index = index; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + strlcpy(pc->pc_name, name, sizeof(pc->pc_name)); + pc->pc_set = ptlrpc_prep_set(); + if (pc->pc_set == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * So far only "client" ptlrpcd uses an environment. In the future, + * ptlrpcd thread (or a thread-set) has to be given an argument, + * describing its "scope". 
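+ * (Note: this per-ctl environment is tagged LCT_CL_THREAD|LCT_REMEMBER
+ * only, whereas the per-thread context initialized inside ptlrpcd()
+ * itself additionally passes LCT_NOREF.)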
+ */ + rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER); + if (rc != 0) + goto out_set; + + { + struct task_struct *task; + if (index >= 0) { + rc = ptlrpcd_bind(index, max); + if (rc < 0) + goto out_env; + } + + task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + goto out_env; + } + + wait_for_completion(&pc->pc_starting); + } + return 0; + +out_env: + lu_context_fini(&pc->pc_env.le_ctx); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_BIND, &pc->pc_flags); + +out: + clear_bit(LIOD_START, &pc->pc_flags); + return rc; +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + return; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + lu_context_fini(&pc->pc_env.le_ctx); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + clear_bit(LIOD_BIND, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; +} + +static void ptlrpcd_fini(void) +{ + int i; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0); + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_free(&ptlrpcds->pd_threads[i]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, ptlrpcds->pd_size); + ptlrpcds = NULL; + } +} + +static int ptlrpcd_init(void) +{ + int nthreads = num_online_cpus(); + char name[16]; + int size, i = -1, j, rc = 0; + + if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads) + nthreads = max_ptlrpcds; + if (nthreads < 2) + nthreads = 2; + if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR) + ptlrpcd_bind_policy = PDB_POLICY_PAIR; + else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR) + nthreads &= ~1; /* make sure it is even */ + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) { + rc = -ENOMEM; + goto out; + } + + snprintf(name, sizeof(name), "ptlrpcd_rcv"); + set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags); + rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv); + if (rc < 0) + goto out; + + /* XXX: We start nthreads ptlrpc daemons. Each of them can process any + * non-recovery async RPC to improve overall async RPC efficiency. + * + * But there are some issues with async I/O RPCs and async non-I/O + * RPCs processed in the same set under some cases. The ptlrpcd may + * be blocked by some async I/O RPC(s), then will cause other async + * non-I/O RPC(s) can not be processed in time. 
+ * + * Maybe we should distinguish blocked async RPCs from non-blocked + * async RPCs, and process them in different ptlrpcd sets to avoid + * unnecessary dependency. But how to distribute async RPCs load + * among all the ptlrpc daemons becomes another trouble. */ + for (i = 0; i < nthreads; i++) { + snprintf(name, sizeof(name), "ptlrpcd_%d", i); + rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]); + if (rc < 0) + goto out; + } + + ptlrpcds->pd_size = size; + ptlrpcds->pd_index = 0; + ptlrpcds->pd_nthreads = nthreads; + +out: + if (rc != 0 && ptlrpcds != NULL) { + for (j = 0; j <= i; j++) + ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0); + for (j = 0; j <= i; j++) + ptlrpcd_free(&ptlrpcds->pd_threads[j]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, size); + ptlrpcds = NULL; + } + + return 0; +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) + rc = ptlrpcd_init(); + mutex_unlock(&ptlrpcd_mutex); + return rc; +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c new file mode 100644 index 000000000..7b1d72947 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c @@ -0,0 +1,379 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_export.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. 
+ * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. + */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct list_head *tmp, *pos; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + spin_unlock(&imp->imp_lock); + + CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Do I need to hold a lock across this iteration? We shouldn't be + * racing with any additions to the list, because we're in recovery + * and are therefore not processing additional requests to add. Calls + * to ptlrpc_free_committed might commit requests, but nothing "newer" + * than the one we're replaying (it can't be committed until it's + * replayed, and we're doing that here). l_f_e_safe protects against + * problems with the current request being committed, in the unlikely + * event of that race. So, in conclusion, I think that it's safe to + * perform this list-walk without the imp_lock held. + * + * But, the {mdc,osc}_replay_open callbacks both iterate + * request lists, and have comments saying they assume the + * imp_lock is being held by ptlrpc_replay, but it's not. it's + * just a little race... + */ + + /* Replay all the committed open requests on committed_list first */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.prev; + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* The last request on committed_list hasn't been replayed */ + if (req->rq_transno > last_transno) { + /* Since the imp_committed_list is immutable before + * all of it's requests being replayed, it's safe to + * use a cursor to accelerate the search */ + imp->imp_replay_cursor = imp->imp_replay_cursor->next; + + while (imp->imp_replay_cursor != + &imp->imp_committed_list) { + req = list_entry(imp->imp_replay_cursor, + struct ptlrpc_request, + rq_replay_list); + if (req->rq_transno > last_transno) + break; + + req = NULL; + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + } + } else { + /* All requests on committed_list have been replayed */ + imp->imp_replay_cursor = &imp->imp_committed_list; + req = NULL; + } + } + + /* All the requests in committed list have been replayed, let's replay + * the imp_replay_list */ + if (req == NULL) { + list_for_each_safe(tmp, pos, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + if (req->rq_transno > last_transno) + break; + req = NULL; + } + } + + /* If need to resend the last sent transno (because a reconnect + * has occurred), then stop on the matching req and send it again. + * If, however, the last sent transno has been committed then we + * continue replay from the next request. 
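+ *
+ * Worked example (illustrative only): with last_transno == 100,
+ * imp_committed_list holding transnos { 90, 95 } and imp_replay_list
+ * holding { 101, 102 }, the committed list contains nothing newer than
+ * 100, so the walk over imp_replay_list above selects the request with
+ * transno 101 and it is handed to ptlrpc_replay_req() below.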
*/ + if (req != NULL && imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req %llu\n", + rc, req->rq_xid); + return rc; + } + *inflight = 1; + } + return rc; +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req, *next; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + return -1; + } + + list_for_each_entry_safe(req, next, &imp->imp_sending_list, + rq_list) { + LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + if (!ptlrpc_no_resend(req)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_resend); + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_wake_delayed); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, + lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator request\n", + obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + goto out; + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + goto out; + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + goto out; + + rc = ptlrpc_connect_import(imp); + if (rc) + goto out; + + if (!async) { + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED || + imp->imp_state == LUSTRE_IMP_DISCON || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c new file mode 100644 index 000000000..21e9dc9d5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -0,0 +1,2459 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +/*********************************************** + * policy registers * + ***********************************************/ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + 
return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + strlcpy(buf, sptlrpc_flavor2name_base(sf->sf_rpc), bufsize); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strlcat(buf, bspec, bufsize); + } + + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/************************************************** + * client context APIs * + **************************************************/ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, + create, remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) 
+ return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->force_die); + ctx->cc_ops->force_die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < get_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + spin_unlock(&imp->imp_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate a appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. + */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + return rc; + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + return -ENOMEM; + } + + return 0; +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* request might be asked to release earlier while still + * in the context waiting list. 
+ */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + LASSERT(req->rq_replen); + + CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n", + req, + oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), + newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), + oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, + newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. + */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + return rc; + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. 
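+ * (sptlrpc_req_ctx_switch() below saves the packed request message,
+ * frees the old req/rep buffers, recomputes the flavor and
+ * re-allocates the request buffer under the new context, restoring
+ * the old flavor if that allocation fails.)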
+ */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + return rc; + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + return 0; +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +int ctx_refresh_timeout(void *data) +{ + struct ptlrpc_request *req = data; + int rc; + + /* conn_cnt is needed in expire_one_request */ + lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); + + rc = ptlrpc_expire_one_request(req, 1); + /* if we started recovery, we should mark this ctx dead; otherwise + * in case of lgssd died nobody would retire this ctx, following + * connecting will still find the same ctx thus cause deadlock. + * there's an assumption that expire time of the request should be + * later than the context refresh expire time. + */ + if (rc == 0) + req->rq_cli_ctx->cc_ops->force_die(req->rq_cli_ctx, 0); + return rc; +} + +static +void ctx_refresh_interrupt(void *data) +{ + struct ptlrpc_request *req = data; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - < 0: don't wait + * - = 0: wait until success or fatal error occur + * - > 0: timeout value (in seconds) + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + struct l_wait_info lwi; + int rc; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + return 0; + + /* + * during the process a request's context might change type even + * (e.g. from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + return rc; + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + return 0; + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + LASSERT(ctx->cc_ops->refresh); + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + return 0; + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + return -EPERM; + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. 
recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have a expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * destroying RPC being sent. So server still can accept the request + * and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. + */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + return 0; + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return -EINTR; + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return rc; + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout < 0) + return -EWOULDBLOCK; + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout, + ctx_refresh_interrupt, req); + rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interrupted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + return rc; + } + + goto again; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. 
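+ * For example (illustrative), an OST_WRITE request gets rq_bulk_write
+ * set here and, if the selected flavor supports bulk protection, also
+ * rq_pack_bulk.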
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + return -ENOMEM; + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + return 0; + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + return -EACCES; + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return -ENOMEM; + + spin_lock_init(&req->rq_lock); + atomic_set(&req->rq_refcount, 10000); + INIT_LIST_HEAD(&req->rq_ctx_chain); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, 0); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + return rc; +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. 
After this function called, + * req->rq_reqmsg is still accessible as clear text. + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + return rc; + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + return rc; +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + return -EPROTO; + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + return -EPROTO; + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + return -EPROTO; + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + return rc; +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. 
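+ *
+ * Rough buffer layout handled here (illustrative sketch):
+ *
+ *     rq_repbuf
+ *     |<--- rq_reply_off --->| rq_repdata (rq_nob_received bytes)
+ *
+ * rq_repdata is then passed to do_cli_unwrap_reply() for verification
+ * or unsealing, which finally exposes rq_repmsg in clear text.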
+ */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + return -ENOMEM; + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) { + rc = -ENOMEM; + goto err_req; + } + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + rc = -EALREADY; + goto err_buf; + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + rc = -EPROTO; + goto err_buf; + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + rc = -EINVAL; + goto err_buf; + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + rc = -EALREADY; + goto err_buf; + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + spin_lock_init(&early_req->rq_lock); + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "error %d unwrap early reply", rc); + goto err_ctx; + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + return 0; + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); 
+err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + return rc; +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/************************************************** + * client side high-level security APIs * + **************************************************/ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking reference of import + */ +static +struct ptlrpc_sec *sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 
0x%x\n", sf->sf_rpc); + return NULL; + } + } + + sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + return sec; +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + spin_lock(&imp->imp_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + spin_unlock(&imp->imp_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + spin_lock(&imp->imp_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + spin_unlock(&imp->imp_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct sptlrpc_flavor *sf) +{ + char str1[32], str2[32]; + + if (sec->ps_flvr.sf_flags != sf->sf_flags) + CDEBUG(D_SEC, "changing sec flags: %s -> %s\n", + sptlrpc_secflags2str(sec->ps_flvr.sf_flags, + str1, sizeof(str1)), + sptlrpc_secflags2str(sf->sf_flags, + str2, sizeof(str2))); + + spin_lock(&sec->ps_lock); + flavor_copy(&sec->ps_flvr, sf); + spin_unlock(&sec->ps_lock); +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + + might_sleep(); + + if (imp == NULL) + return 0; + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. 
+ */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming request */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + goto out; + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + + if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) == + SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) && + SPTLRPC_FLVR_MECH(sf.sf_rpc) == + SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) { + sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); + goto out; + } + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NIDNET(conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + mutex_lock(&imp->imp_sec_mutex); + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + + mutex_unlock(&imp->imp_sec_mutex); +out: + sptlrpc_sec_put(sec); + return rc; +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() */ + import_flush_ctx_common(imp, 0, 1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. 
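+ *
+ * Minimal caller sketch (hypothetical, error handling elided):
+ *
+ *     rc = sptlrpc_cli_alloc_reqbuf(req, msgsize);
+ *     if (rc == 0)
+ *             body = lustre_msg_buf(req->rq_reqmsg, 0, 0);
+ *
+ * The backing buffer comes from the policy's alloc_reqbuf hook, so its
+ * physical form (rq_reqbuf vs. rq_clrbuf) is flavor specific.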
+ */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. 
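+ *
+ * Editor's note (sketch, not from the imported source): because the buffer
+ * may be reallocated, cached pointers must be re-read afterwards, e.g.
+ *
+ *     rc = sptlrpc_cli_enlarge_reqbuf(req, segment, newsize);
+ *     if (rc == 0)
+ *             msg = req->rq_reqmsg;    (refresh any cached pointer)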
+ */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + return 0; + + policy = ctx->cc_sec->ps_policy; + return policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; +} + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/**************************************** + * server side security * + ****************************************/ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? 
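+ *
+ * Editor's note (summary, not from the imported source): each export tracks
+ * a current flavor plus two aging ones, exp_flvr_old[0] ("middle") and
+ * exp_flvr_old[1] ("oldest"), each with an expiry stamp.  When a flavor
+ * change is confirmed below the slots rotate roughly as
+ *
+ *     exp_flvr_old[1] = exp_flvr_old[0];
+ *     exp_flvr_old[0] = exp_flvr;      (expires after EXP_FLVR_UPDATE_EXPIRE)
+ *     exp_flvr        = newly accepted flavor;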
*/ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* make the new flavor as "current", and old ones as + * about-to-expire */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = get_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* most cases should return here, we only interested in + * gss root ctx init */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= get_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (" CFS_DURATION_T ")\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[0] - + get_seconds()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p 
(%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. */ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= get_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (" CFS_DURATION_T ")\n", + exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] - + get_seconds()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (unsigned long) (exp->exp_flvr_expire[0] - + get_seconds()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (unsigned long) (exp->exp_flvr_expire[1] - + get_seconds()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. 
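+ *
+ * Editor's note (not from the imported source): the new flavor chosen below
+ * is only staged in exp_flvr_old[1] with exp_flvr_changed/exp_flvr_adapt set;
+ * sptlrpc_target_export_check() above promotes it to exp_flvr when the first
+ * request that already uses the new flavor is accepted.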
*/ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with a incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. + */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + return SECSVC_DROP; + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the coming inner msg unpacking. 
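+ *
+ * Editor's note (hypothetical caller sketch, not from the imported source):
+ * the service code invoking this function branches on the documented return
+ * values, e.g.
+ *
+ *     rc = sptlrpc_svc_unwrap_request(req);
+ *     if (rc == SECSVC_DROP)
+ *             ... discard the request ...
+ *     else if (rc == SECSVC_COMPLETE)
+ *             ... send the already prepared reply ...
+ *     else
+ *             ... process req->rq_reqmsg, now in clear text ...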
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + return rc; +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + return -ENOMEM; + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + return -ENOMEM; + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + return rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to appropriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + return rc; +} + +/** + * Used by ptlrpc server, to free reply_state. 
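+ *
+ * Editor's note (lifecycle sketch, not from the imported source): a reply
+ * state obtained with sptlrpc_svc_alloc_rs() above, possibly from the
+ * emergency pool when the policy allocation hits -ENOMEM, is released here;
+ * rs_prealloc remembers the emergency-pool case so the buffer is handed back
+ * via lustre_put_emerg_rs() after the policy's free_rs() hook runs.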
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/**************************************** + * bulk security * + ****************************************/ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. 
+ */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + + +/**************************************** + * user descriptor helpers * + ****************************************/ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = cfs_curproc_cap_pack(); + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + if (pud->pud_ngroups > current_ngroups) + pud->pud_ngroups = current_ngroups; + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + __swab32s(&pud->pud_ngroups); + } + + if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/**************************************** + * misc helpers * + ****************************************/ + +const char *sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/**************************************** + * crypto API helper/alloc blkciper * + ****************************************/ + +/**************************************** + * initialize/finalize * + ****************************************/ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); 
+ if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 000000000..c05a8554d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,884 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +/**************************************** + * bulk encryption page pools * + ****************************************/ + + +#define POINTERS_PER_PAGE (PAGE_CACHE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (POINTERS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + /* + * constants + */ + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. 
which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + long epp_last_shrink; + long epp_last_access; + + /* + * in-pool pages bookkeeping + */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* + * statistics + */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + unsigned long epp_st_max_wait; /* in jiffies */ + /* + * pointers to pools + */ + struct page ***epp_pools; +} page_pools; + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, + "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %lds\n" + "last access: %lds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time: " CFS_TIME_T "/%u\n", + totalram_pages, + PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + get_seconds() - page_pools.epp_last_shrink, + get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, + HZ); + + spin_unlock(&page_pools.epp_lock); + + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? 
-1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_CACHE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. 
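+ *
+ * Editor's note (worked example, not from the imported source): assuming
+ * 4 KiB pages and 8-byte pointers, POINTERS_PER_PAGE and PAGES_PER_POOL are
+ * both 512, so page number k of the pool lives at
+ *
+ *     epp_pools[k / 512][k % 512]
+ *
+ * which matches the p_idx/g_idx arithmetic used throughout this file.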
+ */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. + */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL - 1) + / PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC(pools, npools * sizeof(*pools)); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_CACHE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE(pools, npools * sizeof(*pools)); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + 
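+/*
+ * Editor's note (illustration, not part of the imported source): the grow
+ * path above rounds any request up to at least PTLRPC_MAX_BRW_PAGES, does
+ * all page allocation under add_pages_mutex only, and lets enc_pools_insert()
+ * take epp_lock just for the final merge.  With PAGES_PER_POOL = 512 (4 KiB
+ * pages, see the note above), npages_to_npools(1000) == 2, i.e. two pointer
+ * pages are enough to track a thousand data pages.
+ */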
+static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + LASSERT(page_pools.epp_waitqlen >= 0); + + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } +} + +static int enc_pools_should_grow(int page_needed, long now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just happened + * moment ago, but this may cause deadlock if both client and ost + * live on single node. + */ +#if 0 + if (now - page_pools.epp_last_shrink < 2) + return 0; +#endif + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? + */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * we allocate the requested pages atomically. + */ +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + wait_queue_t waitlink; + unsigned long this_idle = -1; + unsigned long tick = 0; + long now; + int p_idx, g_idx; + int i; + + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_iov != NULL) + return 0; + + OBD_ALLOC(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + if (desc->bd_enc_iov == NULL) + return -ENOMEM; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick == 0) + tick = cfs_time_current(); + + now = get_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry(&waitlink, current); + add_wait_queue(&page_pools.epp_waitq, &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + 
} + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = get_seconds(); + + spin_unlock(&page_pools.epp_lock); + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int p_idx, g_idx; + int i; + + if (desc->bd_enc_iov == NULL) + return; + + LASSERT(desc->bd_iov_count > 0); + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += desc->bd_iov_count; + + enc_pools_wakeup(); + + spin_unlock(&page_pools.epp_lock); + + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. + */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static struct shrinker pools_shrinker = { + .count_objects = enc_pools_shrink_count, + .scan_objects = enc_pools_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +int sptlrpc_enc_pool_init(void) +{ + /* + * maximum capacity is 1/8 of total physical memory. + * is the 1/8 a good number? 
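+ *
+ * Editor's note (worked example, not from the imported source): on a node
+ * with 16 GiB of RAM and 4 KiB pages, totalram_pages is roughly 4 M, giving
+ * epp_max_pages of about 512 K pages (2 GiB of bulk encryption buffers) and,
+ * with PAGES_PER_POOL = 512, epp_max_pools of about 1024.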
+ */ + page_pools.epp_max_pages = totalram_pages / 8; + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = get_seconds(); + page_pools.epp_last_access = get_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + register_shrinker(&pools_shrinker); + + return 0; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + unregister_shrinker(&pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char *sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} +EXPORT_SYMBOL(sptlrpc_get_hash_name); + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} +EXPORT_SYMBOL(sptlrpc_get_hash_alg); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) + __swab32s(&bsd->bsd_nob); + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct cfs_crypto_hash_desc *hdesc; + int hashsize; + char hashbuf[64]; + 
unsigned int bufsize; + int i, err; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(hdesc); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page, + desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK, + desc->bd_iov[i].kiov_len); + } + if (hashsize > buflen) { + bufsize = sizeof(hashbuf); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf, + &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf, + &bufsize); + } + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + return err; +} +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c new file mode 100644 index 000000000..56ba9e4e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c @@ -0,0 +1,901 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/lustre_import.h" +#include "../include/lustre_param.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} +EXPORT_SYMBOL(sptlrpc_target_sec_part); + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strncpy(buf, str, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. 
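+ *
+ * Editor's note (examples, not from the imported source): typical inputs
+ * here are base flavor names such as "null" or "plain", optionally with a
+ * bulk hash suffix handled below, e.g. "plain-hash:sha1" (assuming "sha1"
+ * is the name registered for the SHA1 hash); a bare "plain" falls back to
+ * the adler32 bulk hash.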
+ */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + return -EINVAL; + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { + CERROR("invalid network name: %s\n", param); + return -EINVAL; + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + return -EINVAL; + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accommodate one more rule. 
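+ *
+ * Editor's note (not from the imported source): the slot array simply grows
+ * in steps of eight.  The rules stored here come from sptlrpc_parse_rule()
+ * above, whose inputs look like, e.g.
+ *
+ *     default=plain
+ *     tcp0.cli2mdt=null
+ *
+ * where the network name and the flavors are illustrative placeholders.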
+ */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_expand); + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. + */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
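+ *
+ * Editor's note (hypothetical sketch, not from the imported source): a caller
+ * resolving the flavor for a client-to-MDT connection might do
+ *
+ *     struct sptlrpc_flavor sf;
+ *
+ *     if (!sptlrpc_rule_set_choose(rset, LUSTRE_SP_CLI, LUSTRE_SP_MDT,
+ *                                  peer_nid, &sf))
+ *             get_default_flavor(&sf);    (fall back to SPTLRPC_FLVR_NULL)
+ *
+ * where rset and peer_nid are assumed to be supplied by the caller.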
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && + r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_choose); + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_dump); + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static LIST_HEAD(sptlrpc_confs); + +static inline int is_hex(char c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f')); +} + +static void target2fsname(const char *tgt, char *fsname, int buflen) +{ + const char *ptr; + int len; + + ptr = strrchr(tgt, '-'); + if (ptr) { + if ((strncmp(ptr, "-MDT", 4) != 0 && + strncmp(ptr, "-OST", 4) != 0) || + !is_hex(ptr[4]) || !is_hex(ptr[5]) || + !is_hex(ptr[6]) || !is_hex(ptr[7])) + ptr = NULL; + } + + /* if we didn't find the pattern, treat the whole string as fsname */ + if (ptr == NULL) + len = strlen(tgt); + else + len = ptr - tgt; + + len = min(len, buflen - 1); + memcpy(fsname, tgt, len); + fsname[len] = '\0'; +} + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, 
&conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + strcpy(conf->sc_fsname, fsname); + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; + + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + return -EINVAL; + } + + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + return -EINVAL; + } + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + return -EINVAL; + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + return -EINVAL; + + if (conf == NULL) { + target2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + + if (rc == 0) + conf->sc_modified++; + + return rc; +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + return __sptlrpc_process_config(lcfg, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + +void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + if (conf->sc_local) { 
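+ /*
+ * rule sets were loaded from the target's local copy; drop them
+ * here so the incoming MGS config log can repopulate them.
+ */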
+ LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static inline void flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. + */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + target2fsname(target->uuid, name, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). 
+ */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} +EXPORT_SYMBOL(sptlrpc_target_choose_flavor); + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read(&obd->u.cli.cl_sem); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = get_seconds() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) { + sptlrpc_conf_free(conf); + } + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c new file mode 100644 index 000000000..81de68edb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c @@ -0,0 +1,252 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + + +static struct mutex sec_gc_mutex; +static LIST_HEAD(sec_gc_list); +static spinlock_t sec_gc_list_lock; + +static LIST_HEAD(sec_gc_ctx_list); +static spinlock_t sec_gc_ctx_list_lock; + +static struct ptlrpc_thread sec_gc_thread; +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = get_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec_gc_list, &sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_add_sec); + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + might_sleep(); + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_del_sec); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + thread_add_flags(&sec_gc_thread, SVC_SIGNAL); + wake_up(&sec_gc_thread.t_ctl_waitq); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while (!list_empty(&sec_gc_ctx_list)) { + ctx = list_entry(sec_gc_ctx_list.next, + struct ptlrpc_cli_ctx, cc_gc_chain); + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (cfs_time_after(sec->ps_gc_next, get_seconds())) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = get_seconds() + sec->ps_gc_interval; +} + +static int sec_gc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; + struct l_wait_info lwi; + + unshare_fs_struct(); + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + while (1) { + struct ptlrpc_sec *sec; + + thread_clear_flags(thread, SVC_SIGNAL); + 
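+ /*
+ * release contexts handed over by sptlrpc_gc_add_ctx(): this
+ * thread holds their last reference, so they are destroyed here
+ * instead of in the caller's context.
+ */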
sec_process_ctx_list(); +again: + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + + lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_signal(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + return 0; +} + +int sptlrpc_gc_init(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + + /* initialize thread control */ + memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); + init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); + + task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); + if (IS_ERR(task)) { + CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); + return PTR_ERR(task); + } + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_running(&sec_gc_thread), &lwi); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + struct l_wait_info lwi = { 0 }; + + thread_set_flags(&sec_gc_thread, SVC_STOPPING); + wake_up(&sec_gc_thread.t_ctl_waitq); + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_stopped(&sec_gc_thread), &lwi); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 000000000..0d08145a6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + + +struct proc_dir_entry *sptlrpc_proc_root = NULL; +EXPORT_SYMBOL(sptlrpc_proc_root); + +static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %ld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %ld\n", + sec->ps_gc_interval ? 
+ sec->ps_gc_next - get_seconds() : 0); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ + int rc; + + if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + dev->obd_type->typ_name); + return -EINVAL; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, + &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); +static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { + { "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops }, + { NULL } +}; + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_proc_root == NULL); + + sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root, + sptlrpc_lprocfs_vars, NULL); + if (IS_ERR(sptlrpc_proc_root)) { + rc = PTR_ERR(sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + return rc; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + if (sptlrpc_proc_root) { + lprocfs_remove(&sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + } +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c new file mode 100644 index 000000000..4e132435b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c @@ -0,0 +1,458 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include "../include/obd_support.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); + cksumc = lustre_msg_calc_cksum(req->rq_repmsg); + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. 
+ */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size(); + else + req->rq_reply_off = 0; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + req->rq_reply_off = 0; + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + 
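+/*
+ * everything in the null policy is a static singleton: null_sec keeps a
+ * permanent reference (always busy) and the single client context is
+ * marked CACHED | ETERNAL | UPTODATE, so null_lookup_ctx() never has to
+ * create or refresh anything, it only bumps the refcount.
+ *
+ * as an illustration, a flavor rule roughly like
+ * "<fsname>.srpc.flavor.default=null" parsed by sptlrpc_parse_rule()
+ * resolves to SPTLRPC_FLVR_NULL and selects this policy.
+ */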
+static void null_init_internal(void) +{ + static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", + null_policy.sp_name, rc); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c new file mode 100644 index 000000000..a79cd5301 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1013 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include "../include/obd_support.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. + */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len == 0) + continue; + + ptr = kmap(desc->bd_iov[i].kiov_page); + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(desc->bd_iov[i].kiov_page); + return; + } +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int 
plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + return 0; +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + int swabbed; + + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + return -EPROTO; + } + + swabbed = ptlrpc_rep_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + return -EPROTO; + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + return -EPROTO; + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + return -EPROTO; + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + return -EPROTO; + } + + if (unlikely(req->rq_early)) { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, &hsize); + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + return -EINVAL; + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? 
"Missing" : "Unexpected"); + return -EPROTO; + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + return -EPROTO; + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + return 0; +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + if (req->rq_bulk_read) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = 
sec2plsec(sec); + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + return NULL; + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + return NULL; + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + return sec; +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + return ctx; +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + /* do nothing unless caller want to flush for 'all' */ + if (uid != -1) + return 0; + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + return 0; +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + 
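+ /*
+ * lay out the fixed four plain segments: header, embedded message,
+ * optional user descriptor and optional bulk descriptor.
+ */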
lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + return 0; +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = alloc_len; + return 0; +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. */ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. */ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + return 0; +} + +/**************************************** + * service apis * + ****************************************/ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static +int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + int swabbed; + + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + return SECSVC_DROP; + } + + swabbed = ptlrpc_req_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + return -EPROTO; + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + return -EPROTO; + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + return -EPROTO; + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + return SECSVC_DROP; + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + return SECSVC_DROP; + + req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + 
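+ /* the reply state pins the service context; plain_free_rs() drops it */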
atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + return 0; +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + else + req->rq_reply_off = 0; + } else { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + req->rq_reply_off = 0; + } + + return 0; +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute checksum: %d\n", + rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + 
corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c new file mode 100644 index 000000000..8e6142151 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c @@ -0,0 +1,3105 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lu_object.h" +#include "../../include/linux/lnet/types.h" +#include "ptlrpc_internal.h" + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); + +/** Holds a list of all PTLRPC services */ +LIST_HEAD(ptlrpc_all_services); +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +void +ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +int +ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* NB: another thread might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. 
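+		 * Re-checking scp_nrqbds_posted on every pass stops this loop
+		 * from allocating once enough buffers are already posted.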
*/ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state associated to request reply. + */ +void +ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + if (req->rq_export->exp_disconnected) { + ldlm_lock_decref(lock, mode); + } else { + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = !!no_ack; + } +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; /* RS queue */ + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_table for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum number of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof(*b)); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. + */ +static struct ptlrpc_hr_thread * +ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. 
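+ * The target thread is picked by ptlrpc_hr_select(), which prefers the
+ * HR partition matching the service's CPT and otherwise round-robins.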
+ * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); +} + +void +ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT(rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + + rs_batch_init(&batch); + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
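+	 * The nesting here is exp_uncommitted_replies_lock, then (inside
+	 * rs_batch_add) scp_rep_lock, then rs_lock.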
*/ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT(rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); +} +EXPORT_SYMBOL(ptlrpc_commit_replies); + +static int +ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + list_del(&rqbd->rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = (struct ptlrpc_service_part *)castmeharder; + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = cfs_time_current(); + wake_up(&svcpt->scp_waitq); +} + +static void +ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned init; + unsigned total; + unsigned nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* NB: please see comments in lustre_lnet.h for definition + * details of these members */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* don't care about base threads number per partition, + * this is most for non-affinity service */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. 
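+		 * (The loop below adds tc_nthrs_base >> i per iteration, a
+		 * halving series, which is what bounds the total.)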
+ * result will always < 2 * nthrs_base */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger then + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + /* weight is # of HTs */ + if (cpumask_weight(topology_thread_cpumask(0)) > 1) { + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + LASSERT(weight > 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int +ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* active requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt); + /* At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. 
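+	 * The at_init() call below seeds scp_at_estimate with this 10s value.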
*/ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service * +ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_table; + + if (!conf->psc_thr.tc_cpu_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + conf->psc_name, cconf->cc_pattern); + return ERR_PTR(-EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + return ERR_PTR(rc < 0 ? rc : -EINVAL); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + return ERR_PTR(-ENOMEM); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!conf->psc_thr.tc_cpu_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) { + rc = -ENOMEM; + goto failed; + } + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + goto failed; + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (proc_entry != NULL) + ptlrpc_lprocfs_register_service(proc_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + goto failed; + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + goto failed; + } + + return service; +failed: + ptlrpc_unregister_service(service); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
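+ * Requests whose reply state came from the pre-allocated emergency pool
+ * skip the history list, since we are already short of memory.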
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_entry(svcpt->scp_hist_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use. 
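+			 * (it goes back onto scp_rqbd_idle and is re-posted
+			 * later by ptlrpc_server_post_idle_rqbds()).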
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == + 0); + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + if (!list_empty(&req->rq_exp_list)) { + /* remove rq_exp_list from last export */ + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + + /* export has one reference already, so it`s safe to + * add req to export queue here and get another + * reference for request later */ + spin_lock_bh(&export->exp_rpc_lock); + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + spin_unlock_bh(&export->exp_rpc_lock); + } + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); + + return; +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish a active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. + * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +{ + struct obd_export *oldest_exp; + time_t oldest_time, new_time; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. */ + + /* Do not pay attention on 1sec or smaller renewals. 
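+	 * i.e. the export's last-request time is only refreshed when more
+	 * than a second has passed, so the timed-export list is not
+	 * reshuffled on every single RPC.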
*/ + new_time = get_seconds() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + return; + + exp->exp_last_request_time = new_time; + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + return; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (get_seconds() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = + get_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (get_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* + * Failing over, don't handle any more reqs, send + * error response instead. + */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno %llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + __s32 next; + + if (array->paa_count == 0) { + cfs_timer_disarm(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = (__s32)(array->paa_deadline - get_seconds() - + at_early_margin); + if (next <= 0) { + ptlrpc_at_timer((unsigned long)svcpt); + } else { + cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next)); + CDEBUG(D_INFO, "armed %s at %+ds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return 0; + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return -ENOSYS; + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + index = (unsigned long)req->rq_deadline % array->paa_size; + if (array->paa_reqs_count[index] > 0) { + /* latest rpcs will have the latest deadlines in the list, + * so search backward. 
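+		 * The array is a ring of deadline buckets indexed by
+		 * rq_deadline % paa_size, so only the chosen bucket has to be
+		 * kept sorted here.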
*/ + list_for_each_entry_reverse(rq, + &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, + &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void +ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + long olddl = req->rq_deadline - get_seconds(); + time_t newdl; + int rc; + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", + AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svcpt->scp_at_estimate), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + return 0; + + if (olddl < 0) { + DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", + olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + return -ETIMEDOUT; + } + + if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); + return -ENOSYS; + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + /* During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. 
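+		 * Hence the call below feeds min(at_extra,
+		 * obd_recovery_timeout / 4) into at_measured() instead of the
+		 * real processing time.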
*/ + at_measured(&svcpt->scp_at_estimate, min(at_extra, + req->rq_export->exp_obd->obd_recovery_timeout / 4)); + } else { + /* Fake our processing time into the future to ask the clients + * for some extra amount of time */ + at_measured(&svcpt->scp_at_estimate, at_extra + + get_seconds() - + req->rq_arrival_time.tv_sec); + + /* Check to see if we've actually increased the deadline - + * we may be past adaptive_max */ + if (req->rq_deadline >= req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate)) { + DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%ld), not sending early reply\n", + olddl, req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate) - + get_seconds()); + return -ETIMEDOUT; + } + } + newdl = get_seconds() + at_get(&svcpt->scp_at_estimate); + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + return -ENOMEM; + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) { + rc = -ENOMEM; + goto out_free; + } + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + LASSERT(atomic_read(&req->rq_refcount)); + /** if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); + rc = -EINVAL; + goto out; + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) { + rc = -ENODEV; + goto out; + } + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) { + rc = -ENODEV; + goto out_put; + } + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + goto out_put; + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + return rc; +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + struct list_head work_list; + __u32 index, count; + time_t deadline; + time_t now = get_seconds(); + long delay; + int first, counter = 0; + + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + + /* The timer went off, but maybe the nearest rpc already completed. 
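+	 * Either way ptlrpc_at_set_timer() is called again below to match
+	 * whatever deadlines remain in the array.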
*/ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + INIT_LIST_HEAD(&work_list); + deadline = -1; + index = (unsigned long)array->paa_deadline % array->paa_size; + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + ptlrpc_at_remove_timed(rq); + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + list_add(&rq->rq_timed_list, &work_list); + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n", + first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=" CFS_DURATION_T "(jiff)\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay); + } + + /* we took additional refcount so entries can't be deleted from list, no + * locking is needed */ + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + return 1; /* return "did_something" for liblustre */ +} + +/** + * Put the request to the export list if the request may become + * a high priority one. + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + + if (svcpt->scp_service->srv_ops.so_hpreq_handler) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + return rc; + LASSERT(rc == 0); + } + if (req->rq_export && req->rq_ops) { + /* Perform request specific check. We should do this check + * before the request is added into exp_hp_rpcs list otherwise + * it may hit swab race at LU-1044. */ + if (req->rq_ops->hpreq_check) { + rc = req->rq_ops->hpreq_check(req); + /** + * XXX: Out of all current + * ptlrpc_hpreq_ops::hpreq_check(), only + * ldlm_cancel_hpreq_check() can return an error code; + * other functions assert in similar places, which seems + * odd. What also does not seem right is that handlers + * for those RPCs do not assert on the same checks, but + * rather handle the error cases. e.g. see + * ost_rw_hpreq_check(), and ost_brw_read(), + * ost_brw_write(). 
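+			 * As used below, rc < 0 is treated as a hard failure,
+			 * 0 as a normal request and 1 as a high-priority one.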
+ */ + if (rc < 0) + return rc; + LASSERT(rc == 0 || rc == 1); + } + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_add(&req->rq_exp_list, + &req->rq_export->exp_hp_rpcs); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } + + ptlrpc_nrs_req_initialize(svcpt, req, rc); + + return rc; +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + if (req->rq_export && req->rq_ops) { + /* refresh lock timeout again so that client has more + * room to send lock cancel RPC. */ + if (req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. */ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + return rc; + + ptlrpc_nrs_req_add(svcpt, req, !!rc); + + return 0; +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. 
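+ * Concretely: always allow when forced or while fewer than running - 2
+ * requests are active; refuse once nreqs_active reaches running - 1;
+ * otherwise allow only if an HP request is already being handled or the
+ * service has no HP queue at all.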
+ * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force || + svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline bool +ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. + */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + return NULL; + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + return req; +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. 
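+ * In order that means: sptlrpc unwrap, message and body unpacking with
+ * sanity checks, export lookup plus timer refresh, deadline computation
+ * for the AT list, and finally queueing through the NRS policies.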
+ */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + int rc; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + return 0; + } + + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* Consider this still a "queued" request as far as stats are + * concerned */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (get_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s", + cfs_time_sub(get_seconds(), + req->rq_arrival_time.tv_sec)); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + req->rq_svc_thread = thread; + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + goto err_req; + + wake_up(&svcpt->scp_waitq); + return 1; + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + return 1; +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int +ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + struct timeval work_start; + struct timeval work_end; + long timediff; + int rc; + int fail_opc = 0; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time, + NULL); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF); + if (rc) { + CERROR("Failure to initialize session: %d\n", rc); + goto out_req; + } + request->rq_session.lc_thread = thread; + request->rq_session.lc_cookie = 0x5; + lu_context_enter(&request->rq_session); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + request->rq_svc_thread = thread; + if (thread) + request->rq_svc_thread->t_env->le_ses = &request->rq_session; + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, timediff >> 19); + } + + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (get_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline " CFS_DURATION_T ":" CFS_DURATION_T "s ago\n", + libcfs_id2str(request->rq_peer), + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(get_seconds(), + request->rq_deadline)); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? 
+ atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg)); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + rc = svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + lu_context_exit(&request->rq_session); + lu_context_fini(&request->rq_session); + + if (unlikely(get_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (" + CFS_DURATION_T":"CFS_DURATION_T + "s); client may timeout.", + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(get_seconds(), + request->rq_deadline)); + } + + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in " + CFS_DURATION_T"s", + request->rq_early_count, + cfs_time_sub(work_end.tv_sec, + request->rq_arrival_time.tv_sec)); + } + +out_req: + ptlrpc_server_finish_active_request(svcpt, request); + + return 1; +} + +/** + * An internal function to process a single reply state object. + */ +static int +ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + + exp = rs->rs_export; + + LASSERT(rs->rs_difficult); + LASSERT(rs->rs_scheduled); + LASSERT(list_empty(&rs->rs_list)); + + spin_lock(&exp->exp_lock); + /* Noop if removed already */ + list_del_init(&rs->rs_exp_list); + spin_unlock(&exp->exp_lock); + + /* The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. 
+ * + * If we see rs_committed clear, the commit callback _may_ not have + * handled this reply yet and we race with it to grab + * exp_uncommitted_replies_lock before removing the reply from + * exp_uncommitted_replies. Note that if we lose the race and the + * reply has already been removed, list_del_init() is a noop. + * + * If we see rs_committed set, we know the commit callback is handling, + * or has handled this reply since store reordering might allow us to + * see rs_committed set out of sequence. But since this is done + * holding rs_lock, we can be sure it has all completed once we hold + * rs_lock, which we do right next. + */ + if (!rs->rs_committed) { + spin_lock(&exp->exp_uncommitted_replies_lock); + list_del_init(&rs->rs_obd_list); + spin_unlock(&exp->exp_uncommitted_replies_lock); + } + + spin_lock(&rs->rs_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! */ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", + rs, + rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + } + + if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + if (!been_handled && rs->rs_on_net) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put(exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + return 1; + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + return 1; +} + + +static void +ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. 
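+ *
+ * Note that low_water is half of srv_nbuf_per_group, or 0 when
+ * test_req_buffer_pressure is set; in the latter case new buffers are
+ * only grown once none are left posted.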
*/ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static int +ptlrpc_retry_rqbds(void *arg) +{ + struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; + + svcpt->scp_rqbd_timeout = 0; + return -ETIMEDOUT; +} + +static inline int +ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int +ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int +ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +static inline int +ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int +ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + /* Don't exit while there are replies to be handled */ + struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, + ptlrpc_retry_rqbds, svcpt); + + /* XXX: Add this back when libcfs watchdog is merged upstream + lc_watchdog_disable(thread->t_watchdog); + */ + + cond_resched(); + + l_wait_event_exclusive_head(svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + /* + lc_watchdog_touch(thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + */ + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. 
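+ *
+ * Each pass of the loop below waits for work, tops up the request buffer
+ * pool, forks another service thread if the partition is short-handed,
+ * drains up to 100 freshly arrived requests before handling a queued one,
+ * runs the adaptive-timeout (early reply) check, and reposts idle request
+ * buffers, backing off for a tenth of a second when reposting fails.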
+ */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; + struct group_info *ginfo = NULL; + struct lu_env *env; + int counter = 0, rc = 0; + + thread->t_pid = current_pid(); + unshare_fs_struct(); + + /* NB: we will call cfs_cpt_bind() for all threads, because we + * might want to run lustre server only on a subset of system CPUs, + * in that case ->scp_cpt is CFS_CPT_ANY */ + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + + ginfo = groups_alloc(0); + if (!ginfo) { + rc = -ENOMEM; + goto out; + } + + set_current_groups(ginfo); + put_group_info(ginfo); + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + if (rc) + goto out; + } + + OBD_ALLOC_PTR(env); + if (env == NULL) { + rc = -ENOMEM; + goto out_srv_fini; + } + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + goto out_srv_fini; + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + goto out_srv_fini; + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) { + rc = -ENOMEM; + goto out_srv_fini; + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. */ + wake_up(&thread->t_ctl_waitq); + + /* + thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), + NULL, NULL); + */ + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + wake_up(&svcpt->scp_rep_waitq); + spin_unlock(&svcpt->scp_rep_lock); + + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svcpt->scp_nthrs_running); + + /* XXX maintain a list of all managed devices: insert here */ + while (!ptlrpc_thread_stopping(thread)) { + if (ptlrpc_wait_event(svcpt, thread)) + break; + + ptlrpc_check_rqbd_pool(svcpt); + + if (ptlrpc_threads_need_create(svcpt)) { + /* Ignore return code - we tried... 
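+ * ptlrpc_start_thread() may well return -EMFILE here once the
+ * per-partition thread limit is reached; the threads already running
+ * simply keep serving the queue.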
*/ + ptlrpc_start_thread(svcpt, 0); + } + + /* Process all incoming reqs before handling any */ + if (ptlrpc_server_request_incoming(svcpt)) { + lu_context_enter(&env->le_ctx); + env->le_ses = NULL; + ptlrpc_server_handle_req_in(svcpt, thread); + lu_context_exit(&env->le_ctx); + + /* but limit ourselves in case of flood */ + if (counter++ < 100) + continue; + counter = 0; + } + + if (ptlrpc_at_check(svcpt)) + ptlrpc_at_check_timed(svcpt); + + if (ptlrpc_server_request_pending(svcpt, false)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_request(svcpt, thread); + lu_context_exit(&env->le_ctx); + } + + if (ptlrpc_rqbd_pending(svcpt) && + ptlrpc_server_post_idle_rqbds(svcpt) < 0) { + /* I just failed to repost request buffers. + * Wait for a timeout (unless something else + * happens) before I try again */ + svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; + CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", + svcpt->scp_nrqbds_posted); + } + } + + /* + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + */ + +out_srv_fini: + /* + * deconstruct service specific state created by ptlrpc_start_thread() + */ + if (svc->srv_ops.so_thr_done != NULL) + svc->srv_ops.so_thr_done(thread); + + if (env != NULL) { + lu_context_fini(&env->le_ctx); + OBD_FREE_PTR(env); + } +out: + CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", + thread, thread->t_pid, thread->t_id, rc); + + spin_lock(&svcpt->scp_lock); + if (thread_test_and_clear_flags(thread, SVC_STARTING)) + svcpt->scp_nthrs_starting--; + + if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { + /* must know immediately */ + svcpt->scp_nthrs_running--; + } + + thread->t_id = rc; + thread_add_flags(thread, SVC_STOPPED); + + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + + return rc; +} + +static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, + struct list_head *replies) +{ + int result; + + spin_lock(&hrt->hrt_lock); + + list_splice_init(&hrt->hrt_queue, replies); + result = ptlrpc_hr.hr_stopping || !list_empty(replies); + + spin_unlock(&hrt->hrt_lock); + return result; +} + +/** + * Main body of "handle reply" function. 
+ * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + LIST_HEAD (replies); + char threadname[20]; + int rc; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + unshare_fs_struct(); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + ptlrpc_handle_rs(rs); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + rc = PTR_ERR(kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id)); + if (IS_ERR_VALUE(rc)) + break; + } + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + if (!IS_ERR_VALUE(rc)) + continue; + + CERROR("Reply handling thread %d:%d Failed on starting: rc = %d\n", + i, j, rc); + ptlrpc_stop_hr_threads(); + return rc; + } + return 0; +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + LIST_HEAD (zombie); + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) { + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); + } + + wake_up_all(&svcpt->scp_waitq); + + while (!list_empty(&svcpt->scp_threads)) { + thread = list_entry(svcpt->scp_threads.next, + struct ptlrpc_thread, t_link); + if (thread_is_stopped(thread)) { + list_del(&thread->t_link); + list_add(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), &lwi); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while (!list_empty(&zombie)) { + thread = list_entry(zombie.next, + struct ptlrpc_thread, t_link); + 
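+ /* Already off scp_threads and marked stopped; drop it from the
+ * private zombie list and free the descriptor without scp_lock. */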
list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } +} + +/** + * Stops all threads of a particular service \a svc + */ +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } +} +EXPORT_SYMBOL(ptlrpc_stop_all_threads); + +int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + return 0; + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + return rc; +} +EXPORT_SYMBOL(ptlrpc_start_threads); + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + int rc; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + return -ESRCH; + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + return -EMFILE; + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + return -ENOMEM; + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + return -EMFILE; + } + + if (svcpt->scp_nthrs_starting != 0) { + /* serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + return -EAGAIN; + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d", + svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + rc = PTR_ERR(kthread_run(ptlrpc_main, thread, "%s", thread->t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread '%s': rc %d\n", + thread->t_name, rc); + spin_lock(&svcpt->scp_lock); + --svcpt->scp_nthrs_starting; + if (thread_is_stopping(thread)) { + /* this ptlrpc_thread is being handled + * by ptlrpc_svcpt_stop_threads now + */ + thread_add_flags(thread, SVC_STOPPED); + 
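+ /* ptlrpc_svcpt_stop_threads() is waiting on t_ctl_waitq for this
+ * thread to report SVC_STOPPED, so wake it before unlocking. */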
wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + } else { + list_del(&thread->t_link); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + } + return rc; + } + + if (!wait) + return 0; + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + rc = thread_is_stopped(thread) ? thread->t_id : 0; + return rc; +} + +int ptlrpc_hr_init(void) +{ + struct ptlrpc_hr_partition *hrp; + struct ptlrpc_hr_thread *hrt; + int rc; + int i; + int j; + int weight; + + memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); + ptlrpc_hr.hr_cpt_table = cfs_cpt_table; + + ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, + sizeof(*hrp)); + if (ptlrpc_hr.hr_partitions == NULL) + return -ENOMEM; + + init_waitqueue_head(&ptlrpc_hr.hr_waitq); + + weight = cpumask_weight(topology_thread_cpumask(0)); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = i; + + atomic_set(&hrp->hrp_nstarted, 0); + atomic_set(&hrp->hrp_nstopped, 0); + + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); + hrp->hrp_nthrs /= weight; + + LASSERT(hrp->hrp_nthrs > 0); + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i, + hrp->hrp_nthrs * sizeof(*hrt)); + if (hrp->hrp_thrs == NULL) { + rc = -ENOMEM; + goto out; + } + + for (j = 0; j < hrp->hrp_nthrs; j++) { + hrt = &hrp->hrp_thrs[j]; + + hrt->hrt_id = j; + hrt->hrt_partition = hrp; + init_waitqueue_head(&hrt->hrt_waitq); + spin_lock_init(&hrt->hrt_lock); + INIT_LIST_HEAD(&hrt->hrt_queue); + } + } + + rc = ptlrpc_start_hr_threads(); +out: + if (rc != 0) + ptlrpc_hr_fini(); + return rc; +} + +void ptlrpc_hr_fini(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + + if (ptlrpc_hr.hr_partitions == NULL) + return; + + ptlrpc_stop_hr_threads(); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs != NULL) { + OBD_FREE(hrp->hrp_thrs, + hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); + } + } + + cfs_percpt_free(ptlrpc_hr.hr_partitions); + ptlrpc_hr.hr_partitions = NULL; +} + + +/** + * Wait until all already scheduled replies are processed. + */ +static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) +{ + while (1) { + int rc; + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), + NULL, NULL); + + rc = l_wait_event(svcpt->scp_waitq, + atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); + if (rc == 0) + break; + CWARN("Unexpectedly long timeout %s %p\n", + svcpt->scp_service->srv_name, svcpt->scp_service); + } +} + +static void +ptlrpc_service_del_atimer(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + /* early disarm AT timer... */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + cfs_timer_disarm(&svcpt->scp_at_timer); + } +} + +static void +ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct l_wait_info lwi; + int rc; + int i; + + /* All history will be culled when the next request buffer is + * freed in ptlrpc_service_purge_all() */ + svc->srv_hist_nrqbds_cpt_max = 0; + + rc = LNetClearLazyPortal(svc->srv_req_portal); + LASSERT(rc == 0); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Unlink all the request buffers. 
This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Wait for the network to release any buffers + * it's currently filling */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + spin_unlock(&svcpt->scp_lock); + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, &lwi); + if (rc == -ETIMEDOUT) { + CWARN("Service %s waiting for request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while (!list_empty(&svcpt->scp_rep_active)) { + rs = list_entry(svcpt->scp_rep_active.next, + struct ptlrpc_reply_state, rs_list); + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now */ + while (!list_empty(&svcpt->scp_req_incoming)) { + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + LASSERT(list_empty(&svcpt->scp_rqbd_posted)); + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* history should have been culled by + * ptlrpc_server_finish_request */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* Now free all the request buffers since nothing + * references them any more... 
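+ * (idle reply states parked on scp_rep_idle are released just below as
+ * well, once ptlrpc_wait_replies() has seen the last difficult reply
+ * drain)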
*/ + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + ptlrpc_free_rqbd(rqbd); + } + ptlrpc_wait_replies(svcpt); + + while (!list_empty(&svcpt->scp_rep_idle)) { + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, + rs_list); + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + cfs_timer_disarm(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, + sizeof(__u32) * array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + + ptlrpc_service_free(service); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. */ +int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timeval right_now; + long timediff; + + do_gettimeofday(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff / ONE_MILLION) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %lds\n", + svcpt->scp_service->srv_name, timediff / ONE_MILLION); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c new file mode 100644 index 000000000..d6d92046c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c @@ -0,0 +1,4492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_disk.h" +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + * running on Linux centos6-bis 2.6.32-358.0.1.el6-head + * #3 SMP Wed Apr 17 17:37:43 CEST 2013 + * with gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC) + */ + + /* Constants... 
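+ * These pin opcode numbers, flag bits and magic values to their
+ * on-the-wire encodings; any assertion failing here means the protocol
+ * headers no longer match what peers expect.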
*/ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + CLASSERT(MTI_NAME_MAXLEN == 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LAST_OPC == 21, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GETSTATUS == 40, "found %lld\n", + (long long)MDS_GETSTATUS); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, 
"found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MAX == 9, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n", + (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES); + LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_CHANGE); + LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_OPEN); + LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_CLOSE); + LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID1); + LASSERTF(MF_MDC_CANCEL_FID2 == 
0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID2); + LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID3); + LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID4); + LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_AU); + LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MF_GETATTR_LOCK); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 901, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long 
long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + CLASSERT(LDLM_PLAIN == 10); + CLASSERT(LDLM_EXTENT == 11); + CLASSERT(LDLM_FLOCK == 12); + CLASSERT(LDLM_IBITS == 13); + CLASSERT(LDLM_MAX_TYPE == 14); + CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0); + CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1); + LASSERTF(UPDATE_OBJ == 1000, "found %lld\n", + (long long)UPDATE_OBJ); + LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)UPDATE_LAST_OPC); + CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2); + CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3); + CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3); + CLASSERT(LQUOTA_TYPE_USR == 0); + CLASSERT(LQUOTA_TYPE_GRP == 1); + CLASSERT(LQUOTA_RES_MD == 1); + CLASSERT(LQUOTA_RES_DT == 2); + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", + (long long)OBD_LOG_CANCEL); + LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", + (long long)OBD_QC_CALLBACK); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 
8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAC_SOM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(OBJ_CREATE == 1, "found %lld\n", + (long long)OBJ_CREATE); + LASSERTF(OBJ_DESTROY == 2, "found %lld\n", + (long long)OBJ_DESTROY); + LASSERTF(OBJ_REF_ADD == 3, "found %lld\n", + (long long)OBJ_REF_ADD); + LASSERTF(OBJ_REF_DEL == 4, "found %lld\n", + (long long)OBJ_REF_DEL); + LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n", + (long long)OBJ_ATTR_SET); + LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n", + (long long)OBJ_ATTR_GET); + LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n", + (long long)OBJ_XATTR_SET); + LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n", + (long long)OBJ_XATTR_GET); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n", + (long long)OBJ_INDEX_INSERT); + LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n", + (long long)OBJ_INDEX_DELETE); + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + 
LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + (long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n", + (long long)FID_SEQ_OST_MDT1); + LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n", + (long long)FID_SEQ_OST_MAX); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found 
%lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); + /* Checks for union lu_page */ + LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct 
ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 
*)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + CLASSERT(PTLRPC_NUM_VERSIONS == 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding)); + CLASSERT(JOBSTATS_JOBID_SIZE == 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 
(int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 
*)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n", + 
(int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", + PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", + LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", + LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", + LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", + LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", + LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", + LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", + LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)MSG_OP_FLAG_MASK); + LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", + (long long)MSG_OP_FLAG_SHIFT); + LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", + (unsigned)MSG_GEN_FLAG_MASK); + LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_LAST_REPLAY); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_DELAY_REPLAY); + LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_VERSION_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_ASYNC == 
0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_ASYNC); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_unused)); + LASSERTF((int)sizeof(((struct obd_connect_data 
*)0)->ocd_unused) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + 
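The obd_connect_data checks above pin the offset and size of every field so the on-wire layout cannot drift between builds or architectures. A minimal standalone sketch of the same idea, using C11 static_assert and an illustrative struct (wire_hdr and its fields are hypothetical, not Lustre types):

    #include <stddef.h>   /* offsetof */
    #include <stdint.h>
    #include <assert.h>   /* static_assert (C11) */

    /* Illustrative wire structure -- not a Lustre type. */
    struct wire_hdr {
            uint32_t w_magic;       /* bytes 0..3   */
            uint32_t w_len;         /* bytes 4..7   */
            uint64_t w_cookie;      /* bytes 8..15  */
    };

    /* Compile-time layout pinning, analogous to the LASSERTF offset/size
     * checks generated for obd_connect_data: a change in field order,
     * type width or padding breaks the build instead of the protocol. */
    static_assert(sizeof(struct wire_hdr) == 16, "wire_hdr size changed");
    static_assert(offsetof(struct wire_hdr, w_magic) == 0, "w_magic moved");
    static_assert(offsetof(struct wire_hdr, w_len) == 4, "w_len moved");
    static_assert(offsetof(struct wire_hdr, w_cookie) == 8, "w_cookie moved");

The generated Lustre checks use runtime LASSERTF calls instead, so that the offending value can be reported ("found %lld"), but the intent is the same: any change to a wire structure must be made deliberately, with the frozen constants updated to match.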
LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + 
OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CROW); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOIN); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + 
OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, + "found 0x%.16llxULL\n", OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + 
LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_lcookie)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_lcookie)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long 
long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLQOS); + LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCOOKIE); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + 
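Each OBD_MD_* constant above is a bit in obdo.o_valid recording which fields of the obdo actually carry data, and every check follows the same pattern: compare the compiled-in value against the value frozen when the protocol was defined, printing what was actually found on mismatch. A rough, self-contained illustration of that pattern with a hypothetical CHECKF macro standing in for LASSERTF (the real macro lives in libcfs and hooks into the kernel debugging machinery):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for LASSERTF: report the mismatch, then abort.
     * The generated checks pass the observed value as an argument so the
     * message shows what was expected (the condition) and what was found. */
    #define CHECKF(cond, fmt, ...)                                          \
            do {                                                            \
                    if (!(cond)) {                                          \
                            fprintf(stderr, "check '%s' failed: " fmt,      \
                                    #cond, __VA_ARGS__);                    \
                            abort();                                        \
                    }                                                       \
            } while (0)

    #define OBD_MD_FLID (0x00000001ULL)   /* value frozen on the wire */

    int main(void)
    {
            CHECKF(OBD_MD_FLID == 0x00000001ULL, "found 0x%.16llxULL\n",
                   (unsigned long long)OBD_MD_FLID);
            return 0;
    }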
LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEPOCH); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", + OBD_MD_REINT); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), + "found 0x%.16llxULL\n", OBD_MD_TSTATE); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTPERM); + LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMDSCAPA); + LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSSCAPA); + LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSPLIT); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); + LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLSETFACL); + LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLGETFACL); + LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRSETFACL); + LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRGETFACL); + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + CLASSERT(OBD_FL_INLINEDATA == 0x00000001); + CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002); + CLASSERT(OBD_FL_DELORPHAN == 0x00000004); + CLASSERT(OBD_FL_NORPC == 0x00000008); + CLASSERT(OBD_FL_IDONLY == 0x00000010); + CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020); + CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040); + CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100); + CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200); + CLASSERT(OBD_FL_CREATE_CROW == 0x00000400); + CLASSERT(OBD_FL_SRVLOCK == 0x00000800); + CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); + CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); + CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); + CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); + CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); + CLASSERT(OBD_FL_MMAP == 0x00040000); + CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000); + CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000); + CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long 
long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs 
*)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare2)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, 
os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n", + (long long)QUOTABLOCK_BITS); + LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n", + (long long)QUOTABLOCK_SIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + 
LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct 
idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + CLASSERT(LIP_MAGIC == 0x8A6D6B6C); + LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n", + (long long)LIP_HDR_SIZE); + LASSERTF(II_FL_NOHASH == 1, "found %lld\n", + (long long)II_FL_NOHASH); + LASSERTF(II_FL_VARKEY == 2, "found %lld\n", + (long long)II_FL_VARKEY); + LASSERTF(II_FL_VARREC == 4, "found %lld\n", + (long long)II_FL_VARREC); + LASSERTF(II_FL_NONUNQ == 8, "found %lld\n", + (long long)II_FL_NONUNQ); + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->offset)); + LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, len)); + 
LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->len)); + LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid1)); + LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid2)); + LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, 
valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->valid)); + LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->size)); + LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mtime)); + LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->atime)); + LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->ctime)); + LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->blocks)); + LASSERTF((int)offsetof(struct mdt_body, t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->t_state) == 8, + "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->t_state)); + LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsuid)); + LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsgid)); + LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->capability)); + LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mode)); + LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid)); + LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid)); + LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->flags)); + 
LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->rdev)); + LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->nlink)); + LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->unused2)); + LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->suppgid)); + LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->aclsize)); + LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_cookiesize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize)); + LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid_h)); + LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid_h)); + LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_5)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_5)); + LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_6)); + LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_7)); + LASSERTF((int)offsetof(struct 
mdt_body, padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_8)); + LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_9)); + LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n", + MDS_FMODE_EPOCH); + LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n", + MDS_FMODE_TRUNC); + LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n", + MDS_FMODE_SOM); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", + MDS_OPEN_CROSS); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n", + LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n", + LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n", + LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n", + LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n", + LUSTRE_DIRSYNC_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", + MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", + MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", + 
MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", + MDS_INODELOCK_LAYOUT); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, ioepoch)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, flags)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags)); + LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding)); + + /* Checks for struct mdt_remote_perm */ + LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n", + (long long)(int)sizeof(struct mdt_remote_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_uid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_gid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_padding)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding)); + LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETUID_PERM); + LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGID_PERM); + LASSERTF(CFS_SETGRP_PERM == 
0x00000004UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGRP_PERM); + LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTACL_PERM); + LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTOWN_PERM); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found 
%lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct 
mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found 
%lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + 
LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + 
LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, 
lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink 
*)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); 
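
The mdt_rec_* reintegration records checked in this block are all pinned to the same 136-byte layout: a shared credential prefix (opcode, capability, fsuid/fsgid plus supplementary groups), two 16-byte FIDs at offsets 40 and 56, a 64-bit time at 72, and per-operation fields padded out to 136 bytes so the variants stay interchangeable on the wire. The following is a minimal sketch of the same layout-pinning idea using only standard C11 _Static_assert; demo_rec and its member names are illustrative assumptions, not Lustre types, and the stated offsets assume an LP64 ABI with naturally aligned 64-bit members.

/*
 * Illustrative sketch only, not the Lustre LASSERTF macros: a wire record
 * is declared with fixed-width types and every member is nailed to the
 * byte offset the protocol expects, so a compiler or arch that lays it
 * out differently fails to build instead of corrupting traffic.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_rec {              /* hypothetical record, mirrors the mdt_rec_* shape */
	uint32_t opcode;       /* offset  0 */
	uint32_t cap;          /* offset  4 */
	uint64_t fid[2];       /* offset  8 */
	uint64_t time;         /* offset 24 */
	uint32_t bias;         /* offset 32 */
	uint32_t padding;      /* offset 36, keeps the size a multiple of 8 */
};

_Static_assert(sizeof(struct demo_rec) == 40,           "demo_rec size changed");
_Static_assert(offsetof(struct demo_rec, opcode) == 0,  "opcode moved");
_Static_assert(offsetof(struct demo_rec, fid)    == 8,  "fid moved");
_Static_assert(offsetof(struct demo_rec, time)   == 24, "time moved");
_Static_assert(offsetof(struct demo_rec, bias)   == 32, "bias moved");
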
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 
124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, 
sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + 
LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint 
*)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint 
*)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + 
LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lmv_stripe_md */ + LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n", + (long long)(int)sizeof(struct lmv_stripe_md)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_magic)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_count)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_master)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_padding)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16])); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0])); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long 
long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + CLASSERT(RES_NAME_SIZE == 4); 
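
Two idioms recur in these checks. CLASSERT() is used for values that must hold at build time (LOV_MAXPOOLNAME, LOV_DESC_MAGIC and RES_NAME_SIZE above), while LASSERTF() runs at module load and logs the value actually found when a layout drifts. The other idiom is indexing one element past the end of a trailing array, as in offsetof(struct ldlm_res_id, name[4]) == 32: the offset just past name[RES_NAME_SIZE] must equal the structure size, which proves the array spans the whole tail of the record. A small sketch of that idiom follows, under the same caveats as before; res_id_demo is a hypothetical stand-in, and taking offsetof of an array element relies on the usual GCC/Clang constant-folding of __builtin_offsetof.

/*
 * Sketch of the "index one past the array" idiom (assumed names, not the
 * real headers): offsetof(st, arr[N]) is the byte offset just past an
 * N-element array, so comparing it with the struct size shows the array
 * really fills the rest of the record.
 */
#include <stddef.h>
#include <stdint.h>

#define RES_NAME_SIZE_DEMO 4

struct res_id_demo {
	uint64_t name[RES_NAME_SIZE_DEMO];
};

_Static_assert(offsetof(struct res_id_demo, name[RES_NAME_SIZE_DEMO]) ==
	       sizeof(struct res_id_demo),
	       "name[] must fill the whole record");
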
+ LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found 
%lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long 
long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, 
lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, 
lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
ldlm_gl_lquota_desc *)0)->gl_pad2)); + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + CLASSERT(MGS_PARAM_MAXLEN == 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, 
lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + CLASSERT(OST_SZ_REC == 274730752); + CLASSERT(MDS_UNLINK_REC == 274801668); + CLASSERT(MDS_UNLINK64_REC == 275325956); + CLASSERT(MDS_SETATTR64_REC == 275325953); + CLASSERT(OBD_CFG_REC == 274857984); + CLASSERT(LLOG_GEN_REC == 274989056); + CLASSERT(CHANGELOG_REC == 275120128); + CLASSERT(CHANGELOG_USER_REC == 275185664); + CLASSERT(LLOG_HDR_MAGIC == 275010873); + CLASSERT(LLOG_LOGID_MAGIC == 275010875); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found 
%lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + 
LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct 
llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rec */ + LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rec)); + LASSERTF((int)offsetof(struct 
changelog_ext_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid)); + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, 
cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail)); + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + 
LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_reserved)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tail)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail)); + + /* Checks for struct llog_cookie */ + LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_cookie)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_lgl)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_subsys)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_index)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_padding)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding)); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + LASSERTF((int)offsetof(struct 
llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); + CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); + CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); + CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); + CLASSERT(LLOG_ORIGIN_CONNECT == 506); + CLASSERT(LLOG_CATINFO == 507); + CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); + CLASSERT(LLOG_FIRST_OPC == 501); + CLASSERT(LLOG_LAST_OPC == 510); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, name[8])); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found 
%lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap)); + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + 
LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct lustre_capa */ + LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa)); + LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
lustre_capa, lc_fid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_opc)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc)); + LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_uid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_gid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_flags)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags)); + LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_timeout)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout)); + LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_expiry)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry)); + CLASSERT(CAPA_HMAC_MAX_LEN == 64); + LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_hmac[64])); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64])); + + /* Checks for struct lustre_capa_key */ + LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa_key)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_seq)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); + 
CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); + + /* Checks for struct ll_user_fiemap */ + LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct ll_user_fiemap)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_start)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_length)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct 
ll_user_fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents)); + CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001); + CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002); + CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); + + /* Checks for struct ll_fiemap_extent */ + LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_extent)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_device)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device)); + CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); + CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); + CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); + CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); + CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); + CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); + CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); + CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); + CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); + + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long 
long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+	/* Checks for type posix_acl_xattr_header */
+	LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_header));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+	/* Checks for struct link_ea_header */
+	LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_header));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_magic));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_len));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+	LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding1));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+	LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding2));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+	CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+	/* Checks for struct link_ea_entry */
+	LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_entry));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_name));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+	/* Checks for struct layout_intent */
+	LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct layout_intent));
+	LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_opc));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+	LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_flags));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+	LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_start));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+	LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_end));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+	LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+		 (long long)LAYOUT_INTENT_ACCESS);
+	LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+		 (long long)LAYOUT_INTENT_READ);
+	LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+		 (long long)LAYOUT_INTENT_WRITE);
+	LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+		 (long long)LAYOUT_INTENT_GLIMPSE);
+	LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+		 (long long)LAYOUT_INTENT_TRUNC);
+	LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RELEASE);
+	LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RESTORE);
+
+	/* Checks for struct hsm_action_item */
+	LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_item));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_len));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_action));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_data));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+	/* Checks for struct hsm_action_list */
+	LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_list));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_version));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_count));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, padding1));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+	/* Checks for struct hsm_progress */
+	LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+	LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, padding));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+	LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+		 HP_FLAG_COMPLETED);
+	LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+		 HP_FLAG_RETRY);
+
+	LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_flags));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_errval));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+	LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, padding));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_hai));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+	/* Checks for struct hsm_progress_kernel */
+	LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress_kernel));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+	/* Checks for struct hsm_user_item */
+	LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_item));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+	/* Checks for struct hsm_user_state */
+	LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_states));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+	/* Checks for struct hsm_state_set */
+	LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_state_set));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+	/* Checks for struct hsm_current_action */
+	LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_current_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_state));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_action));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_location));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+	/* Checks for struct hsm_request */
+	LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_request));
+	LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_action));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+	LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+	LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_flags));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+	LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+	LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_data_len));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+	LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_FORCE_ACTION);
+	LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_GHOST_COPY);
+
+	/* Checks for struct hsm_user_request */
+	LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_request));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+	/* Checks for struct hsm_user_import */
+	LASSERTF(sizeof(struct hsm_user_import) == 48, "found %lld\n",
+		 (long long)sizeof(struct hsm_user_import));
+	LASSERTF(offsetof(struct hsm_user_import, hui_size) == 0,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_size));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_size) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_size));
+	LASSERTF(offsetof(struct hsm_user_import, hui_uid) == 32,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_uid));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_uid) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_uid));
+	LASSERTF(offsetof(struct hsm_user_import, hui_gid) == 36,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_gid));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_gid) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_gid));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mode) == 40,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mode));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mode) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mode));
+	LASSERTF(offsetof(struct hsm_user_import, hui_atime) == 8,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_atime));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_atime));
+	LASSERTF(offsetof(struct hsm_user_import, hui_atime_ns) == 24,
+		 "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_atime_ns));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mtime) == 16,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mtime));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mtime_ns) == 28,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mtime_ns));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns));
+	LASSERTF(offsetof(struct hsm_user_import, hui_archive_id) == 44,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_archive_id));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_archive_id));
+
+	/* Checks for struct update_buf */
+	LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_buf));
+	LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_magic));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+	LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_count));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+	LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_bufs));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+	/* Checks for struct update_reply */
+	LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_reply));
+	LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_version));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+	LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_count));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+	LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_lens));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+	/* Checks for struct update */
+	LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct update));
+	LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_type));
+	LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_type));
+	LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_batchid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_batchid));
+	LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_fid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_fid));
+	LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_lens));
+	LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_lens));
+	LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_bufs));
+	LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}

--
cgit 1.2.3-korg
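Note (editorial, not part of the patch): the block above is the tail of Lustre's auto-generated wire-protocol check routine, which pins every on-wire struct to fixed offsetof()/sizeof() values via LASSERTF() so that differently built clients and servers agree on message layout. As a rough, hedged illustration of the same pattern only, a stand-alone sketch in plain C is given below; struct example_wire, its fields, and the CHECK() macro are hypothetical stand-ins and do not appear in the Lustre sources.

/* Illustrative sketch of the offsetof()/sizeof() layout-check pattern. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical wire struct; fields mimic an hsm_state_set-style layout. */
struct example_wire {
	uint32_t ew_valid;	/* expected at offset 0, size 4 */
	uint32_t ew_archive_id;	/* expected at offset 4, size 4 */
	uint64_t ew_setmask;	/* expected at offset 8, size 8 */
};

/* Simplified stand-in for LASSERTF(): report any layout mismatch. */
#define CHECK(cond, val)						\
	do {								\
		if (!(cond))						\
			printf("layout check failed: found %lld\n",	\
			       (long long)(val));			\
	} while (0)

int main(void)
{
	/* Mirror the style of the generated checks: total size first,
	 * then each member's offset and size against hard-coded values. */
	CHECK((int)sizeof(struct example_wire) == 16,
	      (int)sizeof(struct example_wire));
	CHECK((int)offsetof(struct example_wire, ew_archive_id) == 4,
	      (int)offsetof(struct example_wire, ew_archive_id));
	CHECK((int)offsetof(struct example_wire, ew_setmask) == 8,
	      (int)offsetof(struct example_wire, ew_setmask));
	CHECK((int)sizeof(((struct example_wire *)0)->ew_setmask) == 8,
	      (int)sizeof(((struct example_wire *)0)->ew_setmask));
	return 0;
}

On a typical ABI with natural alignment the checks above pass silently; any compiler or architecture that packs the struct differently would print a diagnostic, which is exactly the failure mode the generated LASSERTF() calls are guarding against.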